From 86dd15d5873a12ceb80c1bf0b9e21467e289b5e5 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 21 Jun 2021 11:25:34 -0400 Subject: [PATCH] devices: externalize nvidia device driver --- .circleci/config.yml | 6 +- devices/gpu/nvidia/README.md | 21 - devices/gpu/nvidia/cmd/main.go | 20 - devices/gpu/nvidia/device.go | 228 -- devices/gpu/nvidia/device_test.go | 140 - devices/gpu/nvidia/fingerprint.go | 229 -- devices/gpu/nvidia/fingerprint_test.go | 1361 -------- devices/gpu/nvidia/nvml/client.go | 194 -- devices/gpu/nvidia/nvml/client_test.go | 399 --- devices/gpu/nvidia/nvml/driver_default.go | 33 - devices/gpu/nvidia/nvml/driver_linux.go | 85 - devices/gpu/nvidia/nvml/shared.go | 61 - devices/gpu/nvidia/stats.go | 325 -- devices/gpu/nvidia/stats_test.go | 3041 ----------------- go.mod | 2 +- go.sum | 5 +- .../catalog/register_nvidia_linux.go | 14 - .../content/docs/devices/external/index.mdx | 31 +- .../docs/devices/{ => external}/nvidia.mdx | 0 website/content/docs/devices/index.mdx | 15 +- website/data/docs-nav-data.json | 10 +- 21 files changed, 29 insertions(+), 6191 deletions(-) delete mode 100644 devices/gpu/nvidia/README.md delete mode 100644 devices/gpu/nvidia/cmd/main.go delete mode 100644 devices/gpu/nvidia/device.go delete mode 100644 devices/gpu/nvidia/device_test.go delete mode 100644 devices/gpu/nvidia/fingerprint.go delete mode 100644 devices/gpu/nvidia/fingerprint_test.go delete mode 100644 devices/gpu/nvidia/nvml/client.go delete mode 100644 devices/gpu/nvidia/nvml/client_test.go delete mode 100644 devices/gpu/nvidia/nvml/driver_default.go delete mode 100644 devices/gpu/nvidia/nvml/driver_linux.go delete mode 100644 devices/gpu/nvidia/nvml/shared.go delete mode 100644 devices/gpu/nvidia/stats.go delete mode 100644 devices/gpu/nvidia/stats_test.go delete mode 100644 helper/pluginutils/catalog/register_nvidia_linux.go rename website/content/docs/devices/{ => external}/nvidia.mdx (100%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 
1b8b25bd6f6..2dd81ad025d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -625,13 +625,9 @@ workflows: test_module: "api" filters: *backend_test_branches_filter enable_race_testing: true - - test-container: - name: "test-devices" - test_packages: "./devices/..." - filters: *backend_test_branches_filter - test-machine: name: "test-other" - exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e" + exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e" filters: *backend_test_branches_filter - test-machine: name: "test-docker" diff --git a/devices/gpu/nvidia/README.md b/devices/gpu/nvidia/README.md deleted file mode 100644 index 1035c7c8940..00000000000 --- a/devices/gpu/nvidia/README.md +++ /dev/null @@ -1,21 +0,0 @@ -This package provides an implementation of nvidia device plugin - -# Behavior - -Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. Plugin sends statistics for fingerprinted devices every `stats_period` period. - -# Config - -The configuration should be passed via an HCL file that begins with a top level `config` stanza: - -``` -config { - ignored_gpu_ids = ["uuid1", "uuid2"] - fingerprint_period = "5s" -} -``` - -The valid configuration options are: - -* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUIDs strings that should not be exposed to nomad -* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes. 
diff --git a/devices/gpu/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go deleted file mode 100644 index 5c0bea6c4d8..00000000000 --- a/devices/gpu/nvidia/cmd/main.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "context" - - log "github.com/hashicorp/go-hclog" - - "github.com/hashicorp/nomad/devices/gpu/nvidia" - "github.com/hashicorp/nomad/plugins" -) - -func main() { - // Serve the plugin - plugins.ServeCtx(factory) -} - -// factory returns a new instance of the Nvidia GPU plugin -func factory(ctx context.Context, log log.Logger) interface{} { - return nvidia.NewNvidiaDevice(ctx, log) -} diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go deleted file mode 100644 index 67680dc2a0e..00000000000 --- a/devices/gpu/nvidia/device.go +++ /dev/null @@ -1,228 +0,0 @@ -package nvidia - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - log "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper/pluginutils/loader" - "github.com/hashicorp/nomad/plugins/base" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/hclspec" -) - -const ( - // pluginName is the name of the plugin - pluginName = "nvidia-gpu" - - // vendor is the vendor providing the devices - vendor = "nvidia" - - // deviceType is the type of device being returned - deviceType = device.DeviceTypeGPU - - // notAvailable value is returned to nomad server in case some properties were - // undetected by nvml driver - notAvailable = "N/A" - - // Nvidia-container-runtime environment variable names - NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" -) - -var ( - // PluginID is the nvidia plugin metadata registered in the plugin - // catalog. - PluginID = loader.PluginID{ - Name: pluginName, - PluginType: base.PluginTypeDevice, - } - - // PluginConfig is the nvidia factory function registered in the - // plugin catalog. 
- PluginConfig = &loader.InternalPluginConfig{ - Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) }, - } - - // pluginInfo describes the plugin - pluginInfo = &base.PluginInfoResponse{ - Type: base.PluginTypeDevice, - PluginApiVersions: []string{device.ApiVersion010}, - PluginVersion: "0.1.0", - Name: pluginName, - } - - // configSpec is the specification of the plugin's configuration - configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ - "enabled": hclspec.NewDefault( - hclspec.NewAttr("enabled", "bool", false), - hclspec.NewLiteral("true"), - ), - "ignored_gpu_ids": hclspec.NewDefault( - hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), - hclspec.NewLiteral("[]"), - ), - "fingerprint_period": hclspec.NewDefault( - hclspec.NewAttr("fingerprint_period", "string", false), - hclspec.NewLiteral("\"1m\""), - ), - }) -) - -// Config contains configuration information for the plugin. -type Config struct { - Enabled bool `codec:"enabled"` - IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` - FingerprintPeriod string `codec:"fingerprint_period"` -} - -// NvidiaDevice contains all plugin specific data -type NvidiaDevice struct { - // enabled indicates whether the plugin should be enabled - enabled bool - - // nvmlClient is used to get data from nvidia - nvmlClient nvml.NvmlClient - - // initErr holds an error retrieved during - // nvmlClient initialization - initErr error - - // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad - ignoredGPUIDs map[string]struct{} - - // fingerprintPeriod is how often we should call nvml to get list of devices - fingerprintPeriod time.Duration - - // devices is the set of detected eligible devices - devices map[string]struct{} - deviceLock sync.RWMutex - - logger log.Logger -} - -// NewNvidiaDevice returns a new nvidia device plugin. 
-func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice { - nvmlClient, err := nvml.NewNvmlClient() - logger := log.Named(pluginName) - if err != nil && err.Error() != nvml.UnavailableLib.Error() { - logger.Error("unable to initialize Nvidia driver", "reason", err) - } - return &NvidiaDevice{ - logger: logger, - devices: make(map[string]struct{}), - ignoredGPUIDs: make(map[string]struct{}), - nvmlClient: nvmlClient, - initErr: err, - } -} - -// PluginInfo returns information describing the plugin. -func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { - return pluginInfo, nil -} - -// ConfigSchema returns the plugins configuration schema. -func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { - return configSpec, nil -} - -// SetConfig is used to set the configuration of the plugin. -func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { - var config Config - if len(cfg.PluginConfig) != 0 { - if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { - return err - } - } - - d.enabled = config.Enabled - - for _, ignoredGPUId := range config.IgnoredGPUIDs { - d.ignoredGPUIDs[ignoredGPUId] = struct{}{} - } - - period, err := time.ParseDuration(config.FingerprintPeriod) - if err != nil { - return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) - } - d.fingerprintPeriod = period - - return nil -} - -// Fingerprint streams detected devices. If device changes are detected or the -// devices health changes, messages will be emitted. 
-func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.FingerprintResponse) - go d.fingerprint(ctx, outCh) - return outCh, nil -} - -type reservationError struct { - notExistingIDs []string -} - -func (e *reservationError) Error() string { - return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) -} - -// Reserve returns information on how to mount given devices. -// Assumption is made that nomad server is responsible for correctness of -// GPU allocations, handling tricky cases such as double-allocation of single GPU -func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { - if len(deviceIDs) == 0 { - return &device.ContainerReservation{}, nil - } - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - // Due to the asynchronous nature of NvidiaPlugin, there is a possibility - // of race condition - // - // Timeline: - // 1 - fingerprint reports that GPU with id "1" is present - // 2 - the following events happen at the same time: - // a) server decides to allocate GPU with id "1" - // b) fingerprint check reports that GPU with id "1" is no more present - // - // The latest and always valid version of fingerprinted ids are stored in - // d.devices map. To avoid this race condition an error is returned if - // any of provided deviceIDs is not found in d.devices map - d.deviceLock.RLock() - var notExistingIDs []string - for _, id := range deviceIDs { - if _, deviceIDExists := d.devices[id]; !deviceIDExists { - notExistingIDs = append(notExistingIDs, id) - } - } - d.deviceLock.RUnlock() - if len(notExistingIDs) != 0 { - return nil, &reservationError{notExistingIDs} - } - - return &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: strings.Join(deviceIDs, ","), - }, - }, nil -} - -// Stats streams statistics for the detected devices. 
-func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.StatsResponse) - go d.stats(ctx, outCh, interval) - return outCh, nil -} diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go deleted file mode 100644 index a5ec354e243..00000000000 --- a/devices/gpu/nvidia/device_test.go +++ /dev/null @@ -1,140 +0,0 @@ -package nvidia - -import ( - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/plugins/device" - "github.com/stretchr/testify/require" -) - -type MockNvmlClient struct { - FingerprintError error - FingerprintResponseReturned *nvml.FingerprintData - - StatsError error - StatsResponseReturned []*nvml.StatsData -} - -func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) { - return c.FingerprintResponseReturned, c.FingerprintError -} - -func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { - return c.StatsResponseReturned, c.StatsError -} - -func TestReserve(t *testing.T) { - cases := []struct { - Name string - ExpectedReservation *device.ContainerReservation - ExpectedError error - Device *NvidiaDevice - RequestedIDs []string - }{ - { - Name: "All RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - "UUID3", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Some RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID3": {}, - }, - logger: 
hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "All RequestedIDs are managed by Device", - ExpectedReservation: &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: "UUID1,UUID2,UUID3", - }, - }, - ExpectedError: nil, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "No IDs requested", - ExpectedReservation: &device.ContainerReservation{}, - ExpectedError: nil, - RequestedIDs: nil, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Device is disabled", - ExpectedReservation: nil, - ExpectedError: device.ErrPluginDisabled, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: false, - }, - }, - } - - for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) - require.Equal(t, c.ExpectedReservation, actualReservation) - require.Equal(t, c.ExpectedError, actualError) - }) - } -} diff --git a/devices/gpu/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go deleted file mode 100644 index 45bb34fa335..00000000000 --- a/devices/gpu/nvidia/fingerprint.go +++ /dev/null @@ -1,229 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names and units for reporting Fingerprint output - MemoryAttr = "memory" - PowerAttr = "power" - BAR1Attr = "bar1" - 
DriverVersionAttr = "driver_version" - CoresClockAttr = "cores_clock" - MemoryClockAttr = "memory_clock" - PCIBandwidthAttr = "pci_bandwidth" - DisplayStateAttr = "display_state" - PersistenceModeAttr = "persistence_mode" -) - -// fingerprint is the long running goroutine that detects hardware -func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { - defer close(devices) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) - devices <- device.NewFingerprintError(d.initErr) - } - - // Just close the channel to let server know that there are no working - // Nvidia GPU units - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - ticker.Reset(d.fingerprintPeriod) - } - d.writeFingerprintToChannel(devices) - } -} - -// writeFingerprintToChannel makes nvml call and writes response to channel -func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { - fingerprintData, err := d.nvmlClient.GetFingerprintData() - if err != nil { - d.logger.Error("failed to get fingerprint nvidia devices", "error", err) - devices <- device.NewFingerprintError(err) - return - } - - // ignore devices from fingerprint output - fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs) - // check if any device health was updated or any device was added to host - if !d.fingerprintChanged(fingerprintDevices) { - return - } - - commonAttributes := map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr(fingerprintData.DriverVersion), - }, - } - - // Group all FingerprintDevices by DeviceName attribute - deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData) - for _, device := range 
fingerprintDevices { - deviceName := device.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device) - } - - // Build Fingerprint response with computed groups and send it over the channel - deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName)) - for groupName, devices := range deviceListByDeviceName { - deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes)) - } - devices <- device.NewFingerprint(deviceGroups...) -} - -// ignoreFingerprintedDevices excludes ignored devices from fingerprint output -func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData { - var result []*nvml.FingerprintDeviceData - for _, fingerprintDevice := range deviceData { - if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored { - result = append(result, fingerprintDevice) - } - } - return result -} - -// fingerprintChanged checks if there are any previously unseen nvidia devices located -// or any of fingerprinted nvidia devices disappeared since the last fingerprint run. 
-// Also, this func updates device map on NvidiaDevice with the latest data -func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { - d.deviceLock.Lock() - defer d.deviceLock.Unlock() - - changeDetected := false - // check if every device in allDevices is in d.devices - for _, device := range allDevices { - if _, ok := d.devices[device.UUID]; !ok { - changeDetected = true - } - } - - // check if every device in d.devices is in allDevices - fingerprintDeviceMap := make(map[string]struct{}) - for _, device := range allDevices { - fingerprintDeviceMap[device.UUID] = struct{}{} - } - for id := range d.devices { - if _, ok := fingerprintDeviceMap[id]; !ok { - changeDetected = true - } - } - - d.devices = fingerprintDeviceMap - return changeDetected -} - -// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice -func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup { - // deviceGroup without devices makes no sense -> return nil when no devices are provided - if len(deviceList) == 0 { - return nil - } - - devices := make([]*device.Device, len(deviceList)) - for index, dev := range deviceList { - devices[index] = &device.Device{ - ID: dev.UUID, - // all fingerprinted devices are "healthy" for now - // to get real health data -> dcgm bindings should be used - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: dev.PCIBusID, - }, - } - } - - deviceGroup := &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - Devices: devices, - // Assumption made that devices with the same DeviceName have the same - // attributes like amount of memory, power, bar1memory etc - Attributes: attributesFromFingerprintDeviceData(deviceList[0]), - } - - // Extend attribute map with common attributes - for attributeKey, attributeValue := range commonAttributes { - 
deviceGroup.Attributes[attributeKey] = attributeValue - } - - return deviceGroup -} - -// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData -// struct to device.DeviceGroup.Attributes format (map[string]string) -// this function performs all nil checks for FingerprintDeviceData pointers -func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute { - attrs := map[string]*structs.Attribute{ - DisplayStateAttr: { - String: helper.StringToPtr(d.DisplayState), - }, - PersistenceModeAttr: { - String: helper.StringToPtr(d.PersistenceMode), - }, - } - - if d.MemoryMiB != nil { - attrs[MemoryAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryMiB)), - Unit: structs.UnitMiB, - } - } - if d.PowerW != nil { - attrs[PowerAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PowerW)), - Unit: structs.UnitW, - } - } - if d.BAR1MiB != nil { - attrs[BAR1Attr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.BAR1MiB)), - Unit: structs.UnitMiB, - } - } - if d.CoresClockMHz != nil { - attrs[CoresClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.MemoryClockMHz != nil { - attrs[MemoryClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.PCIBandwidthMBPerS != nil { - attrs[PCIBandwidthAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)), - Unit: structs.UnitMBPerS, - } - } - - return attrs -} diff --git a/devices/gpu/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go deleted file mode 100644 index c85b5c8c90a..00000000000 --- a/devices/gpu/nvidia/fingerprint_test.go +++ /dev/null @@ -1,1361 +0,0 @@ -package nvidia - -import ( - "context" - "errors" - "sort" - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - 
"github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestIgnoreFingerprintedDevices(t *testing.T) { - for _, testCase := range []struct { - Name string - DeviceData []*nvml.FingerprintDeviceData - IgnoredGPUIds map[string]struct{} - ExpectedResult []*nvml.FingerprintDeviceData - }{ - { - Name: "Odd ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "Even ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: 
&nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "All ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: nil, - }, - { - Name: "No ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{}, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "No DeviceData provided", - DeviceData: nil, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - 
"UUID3": {}, - }, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestCheckFingerprintUpdates(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - AllDevices []*nvml.FingerprintDeviceData - DeviceMapAfterMethodCall map[string]struct{} - ExpectedResult bool - }{ - { - Name: "No updates", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: false, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "New Device Appeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "I am new", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - "I am new": {}, - }, - }, - { - Name: "Device disappeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - }, - }, - { - Name: "No devices in NvidiaDevice map", - Device: &NvidiaDevice{}, - 
AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "No devices detected", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: nil, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{}, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) - req := require.New(t) - // check that function returns valid "updated / not updated" state - req.Equal(testCase.ExpectedResult, actualResult) - // check that function propely updates devices map - req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) - }) - } -} - -func TestAttributesFromFingerprintDeviceData(t *testing.T) { - for _, testCase := range []struct { - Name string - FingerprintDeviceData *nvml.FingerprintDeviceData - ExpectedResult map[string]*structs.Attribute - }{ - { - Name: "All attributes are not nil", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(256), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: 
helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - { - Name: "nil values are omitted", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) - require.Equal(t, testCase.ExpectedResult, actualResult) - }) - } -} - -func TestDeviceGroupFromFingerprintData(t *testing.T) { - for _, testCase := range []struct { - Name string - GroupName string - Devices []*nvml.FingerprintDeviceData - CommonAttributes map[string]*structs.Attribute - ExpectedResult *device.DeviceGroup - }{ - { - Name: "Devices are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: 
"Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - }, - { - Name: "Devices and common attributes are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { 
- DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - { - Name: "Devices are not provided", - GroupName: "Type1", - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - Devices: nil, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) - 
require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestWriteFingerprintToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that FingerprintError is handled properly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New(""), - }, - }, - { - Name: "Check ignore devices works correctly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - ignoredGPUIDs: map[string]struct{}{ - "1": {}, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: 
map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 1", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name3"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - 
DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name3", - Devices: []*device.Device{ - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(12), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 2", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - 
}, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - 
Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - channel := make(chan *device.FingerprintResponse, 1) - testCase.Device.writeFingerprintToChannel(channel) - actualResult := <-channel - // writeFingerprintToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedResult arrays has to be sorted firsted - sort.Slice(actualResult.Devices, func(i, j int) bool { - return actualResult.Devices[i].Name < actualResult.Devices[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name - }) - require.Equal(t, testCase.ExpectedWriteToChannel, actualResult) - }) - } -} - -// Test if nonworking driver returns empty fingerprint data -func TestFingerprint(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that working driver returns valid fingeprint data", - Device: &NvidiaDevice{ - initErr: nil, - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - 
Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: 
structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check that not working driver returns error fingeprint data", - Device: &NvidiaDevice{ - initErr: errors.New("foo"), - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New("foo"), - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - outCh := make(chan *device.FingerprintResponse) - ctx, cancel := context.WithCancel(context.Background()) - go testCase.Device.fingerprint(ctx, outCh) - result := <-outCh - cancel() - require.New(t).Equal(result, testCase.ExpectedWriteToChannel) - }) - } -} diff --git a/devices/gpu/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go deleted file mode 100644 index d18dcbe1a9f..00000000000 --- a/devices/gpu/nvidia/nvml/client.go +++ /dev/null @@ -1,194 +0,0 @@ -package nvml - -import ( - "fmt" -) - -// DeviceData represents common 
fields for Nvidia device -type DeviceData struct { - UUID string - DeviceName *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 -} - -// FingerprintDeviceData is a superset of DeviceData -// it describes device specific fields returned from -// nvml queries during fingerprinting call -type FingerprintDeviceData struct { - *DeviceData - PCIBandwidthMBPerS *uint - CoresClockMHz *uint - MemoryClockMHz *uint - DisplayState string - PersistenceMode string - PCIBusID string -} - -// FingerprintData represets attributes of driver/devices -type FingerprintData struct { - Devices []*FingerprintDeviceData - DriverVersion string -} - -// StatsData is a superset of DeviceData -// it represents statistics data returned for every Nvidia device -type StatsData struct { - *DeviceData - PowerUsageW *uint - GPUUtilization *uint - MemoryUtilization *uint - EncoderUtilization *uint - DecoderUtilization *uint - TemperatureC *uint - UsedMemoryMiB *uint64 - BAR1UsedMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} - -// NvmlClient describes how users would use nvml library -type NvmlClient interface { - GetFingerprintData() (*FingerprintData, error) - GetStatsData() ([]*StatsData, error) -} - -// nvmlClient implements NvmlClient -// Users of this lib are expected to use this struct via NewNvmlClient func -type nvmlClient struct { - driver NvmlDriver -} - -// NewNvmlClient function creates new nvmlClient with real -// NvmlDriver implementation. 
Also, this func initializes NvmlDriver -func NewNvmlClient() (*nvmlClient, error) { - driver := &nvmlDriver{} - err := driver.Initialize() - if err != nil { - return nil, err - } - return &nvmlClient{ - driver: driver, - }, nil -} - -// GetFingerprintData returns FingerprintData for available Nvidia devices -func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { - /* - nvml fields to be fingerprinted # nvml_library_call - 1 - Driver Version # nvmlSystemGetDriverVersion - 2 - Product Name # nvmlDeviceGetName - 3 - GPU UUID # nvmlDeviceGetUUID - 4 - Total Memory # nvmlDeviceGetMemoryInfo - 5 - Power # nvmlDeviceGetPowerManagementLimit - 6 - PCIBusID # nvmlDeviceGetPciInfo - 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( - 8 - PCI Bandwidth - 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo - 10 - Display Mode # nvmlDeviceGetDisplayMode - 11 - Persistence Mode # nvmlDeviceGetPersistenceMode - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - driverVersion, err := c.driver.SystemDriverVersion() - if err != nil { - return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) - } - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) - } - - allNvidiaGPUResources[i] = &FingerprintDeviceData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, - CoresClockMHz: deviceInfo.CoresClockMHz, - MemoryClockMHz: 
deviceInfo.MemoryClockMHz, - DisplayState: deviceInfo.DisplayState, - PersistenceMode: deviceInfo.PersistenceMode, - PCIBusID: deviceInfo.PCIBusID, - } - } - return &FingerprintData{ - Devices: allNvidiaGPUResources, - DriverVersion: driverVersion, - }, nil -} - -// GetStatsData returns statistics data for all devices on this machine -func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { - /* - nvml fields to be reported to stats api # nvml_library_call - 1 - Used Memory # nvmlDeviceGetMemoryInfo - 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates - 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates - 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization - 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization - 6 - Current GPU Temperature # nvmlDeviceGetTemperature - 7 - Power Draw # nvmlDeviceGetPowerUsage - 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo - 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter - 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter - 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUStats := make([]*StatsData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) - } - - allNvidiaGPUStats[i] = &StatsData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PowerUsageW: deviceStatus.PowerUsageW, - GPUUtilization: 
deviceStatus.GPUUtilization, - MemoryUtilization: deviceStatus.MemoryUtilization, - EncoderUtilization: deviceStatus.EncoderUtilization, - DecoderUtilization: deviceStatus.DecoderUtilization, - TemperatureC: deviceStatus.TemperatureC, - UsedMemoryMiB: deviceStatus.UsedMemoryMiB, - BAR1UsedMiB: deviceStatus.BAR1UsedMiB, - ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, - ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, - ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } - } - return allNvidiaGPUStats, nil -} diff --git a/devices/gpu/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go deleted file mode 100644 index 23731f7b052..00000000000 --- a/devices/gpu/nvidia/nvml/client_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package nvml - -import ( - "errors" - "testing" - - "github.com/hashicorp/nomad/helper" - "github.com/stretchr/testify/require" -) - -type MockNVMLDriver struct { - systemDriverCallSuccessful bool - deviceCountCallSuccessful bool - deviceInfoByIndexCallSuccessful bool - deviceInfoAndStatusByIndexCallSuccessful bool - driverVersion string - devices []*DeviceInfo - deviceStatus []*DeviceStatus -} - -func (m *MockNVMLDriver) Initialize() error { - return nil -} - -func (m *MockNVMLDriver) Shutdown() error { - return nil -} - -func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { - if !m.systemDriverCallSuccessful { - return "", errors.New("failed to get system driver") - } - return m.driverVersion, nil -} - -func (m *MockNVMLDriver) DeviceCount() (uint, error) { - if !m.deviceCountCallSuccessful { - return 0, errors.New("failed to get device length") - } - return uint(len(m.devices)), nil -} - -func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - if index >= uint(len(m.devices)) { - return nil, errors.New("index is out of range") - } - if !m.deviceInfoByIndexCallSuccessful { - return nil, errors.New("failed to get device info by index") - } - return m.devices[index], nil -} - -func (m *MockNVMLDriver) 
DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { - return nil, nil, errors.New("index is out of range") - } - if !m.deviceInfoAndStatusByIndexCallSuccessful { - return nil, nil, errors.New("failed to get device info and status by index") - } - return m.devices[index], m.deviceStatus[index], nil -} - -func TestGetFingerprintDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult *FingerprintData - }{ - { - Name: "fail on systemDriverCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: false, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceInfoByIndexCall", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - 
MemoryClockMHz: helper.UintToPtr(100), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: &FingerprintData{ - DriverVersion: "driverVersion", - Devices: []*FingerprintDeviceData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - PCIBusID: "busId1", - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - PCIBusID: "busId2", - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - driverVersion: "driverVersion", - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - 
PersistenceMode: "Enabled", - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, fingerprintData) - } -} - -func TestGetStatsDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult []*StatsData - }{ - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on DeviceInfoAndStatusByIndex call", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - 
GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: []*StatsData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - 
DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - DriverConfiguration: &MockNVMLDriver{ - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - 
PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - statsData, err := cli.GetStatsData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, statsData) - } -} diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go deleted file mode 100644 index e67efa22eea..00000000000 --- a/devices/gpu/nvidia/nvml/driver_default.go +++ /dev/null @@ -1,33 +0,0 @@ -// +build !linux - -package nvml - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return UnavailableLib -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return UnavailableLib -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return "", UnavailableLib -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return 0, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - return nil, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - return nil, nil, UnavailableLib -} diff --git a/devices/gpu/nvidia/nvml/driver_linux.go b/devices/gpu/nvidia/nvml/driver_linux.go deleted file mode 100644 index bdd777561bc..00000000000 --- a/devices/gpu/nvidia/nvml/driver_linux.go +++ /dev/null @@ -1,85 +0,0 @@ -package nvml 
- -import ( - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return nvml.Init() -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return nvml.Shutdown() -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return nvml.GetDriverVersion() -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return nvml.GetDeviceCount() -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, err - } - deviceMode, err := device.GetDeviceMode() - if err != nil { - return nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - DisplayState: deviceMode.DisplayInfo.Mode.String(), - PersistenceMode: deviceMode.Persistence.String(), - }, nil -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, nil, err - } - status, err := device.Status() - if err != nil { - return nil, nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - 
MemoryClockMHz: device.Clocks.Memory, - }, &DeviceStatus{ - TemperatureC: status.Temperature, - GPUUtilization: status.Utilization.GPU, - MemoryUtilization: status.Utilization.Memory, - EncoderUtilization: status.Utilization.Encoder, - DecoderUtilization: status.Utilization.Decoder, - UsedMemoryMiB: status.Memory.Global.Used, - ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache, - ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache, - ECCErrorsDevice: status.Memory.ECCErrors.Device, - PowerUsageW: status.Power, - BAR1UsedMiB: status.PCI.BAR1Used, - }, nil -} diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go deleted file mode 100644 index a0bb04d2223..00000000000 --- a/devices/gpu/nvidia/nvml/shared.go +++ /dev/null @@ -1,61 +0,0 @@ -package nvml - -import "errors" - -var ( - // UnavailableLib is returned when the nvml library could not be loaded. - UnavailableLib = errors.New("could not load NVML library") -) - -// nvmlDriver implements NvmlDriver -// Users are required to call Initialize method before using any other methods -type nvmlDriver struct{} - -// NvmlDriver represents set of methods to query nvml library -type NvmlDriver interface { - Initialize() error - Shutdown() error - SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) -} - -// DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods -type DeviceInfo struct { - // The following fields are guaranteed to be retrieved from nvml - UUID string - PCIBusID string - DisplayState string - PersistenceMode string - - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - Name *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 - PCIBandwidthMBPerS *uint - CoresClockMHz *uint 
- MemoryClockMHz *uint -} - -// DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method -type DeviceStatus struct { - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go deleted file mode 100644 index c6c44775791..00000000000 --- a/devices/gpu/nvidia/stats.go +++ /dev/null @@ -1,325 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names for reporting stats output - PowerUsageAttr = "Power usage" - PowerUsageUnit = "W" - PowerUsageDesc = "Power usage for this GPU in watts and " + - "its associated circuitry (e.g. memory) / Maximum GPU Power" - GPUUtilizationAttr = "GPU utilization" - GPUUtilizationUnit = "%" - GPUUtilizationDesc = "Percent of time over the past sample period " + - "during which one or more kernels were executing on the GPU." 
- MemoryUtilizationAttr = "Memory utilization" - MemoryUtilizationUnit = "%" - MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" - EncoderUtilizationAttr = "Encoder utilization" - EncoderUtilizationUnit = "%" - EncoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Encoder was used" - DecoderUtilizationAttr = "Decoder utilization" - DecoderUtilizationUnit = "%" - DecoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Decoder was used" - TemperatureAttr = "Temperature" - TemperatureUnit = "C" // Celsius degrees - TemperatureDesc = "Temperature of the Unit" - MemoryStateAttr = "Memory state" - MemoryStateUnit = "MiB" // Mebibytes - MemoryStateDesc = "UsedMemory / TotalMemory" - BAR1StateAttr = "BAR1 buffer state" - BAR1StateUnit = "MiB" // Mebibytes - BAR1StateDesc = "UsedBAR1 / TotalBAR1" - ECCErrorsL1CacheAttr = "ECC L1 errors" - ECCErrorsL1CacheUnit = "#" // number of errors - ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" - ECCErrorsL2CacheAttr = "ECC L2 errors" - ECCErrorsL2CacheUnit = "#" // number of errors - ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" - ECCErrorsDeviceAttr = "ECC memory errors" - ECCErrorsDeviceUnit = "#" // number of errors - ECCErrorsDeviceDesc = "Requested memory error counter for the device" -) - -// stats is the long running goroutine that streams device statistics -func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) { - defer close(stats) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) - stats <- device.NewStatsError(d.initErr) - } - - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return 
- case <-ticker.C: - ticker.Reset(interval) - } - - d.writeStatsToChannel(stats, time.Now()) - } -} - -// filterStatsByID accepts list of StatsData and set of IDs -// this function would return entries from StatsData with IDs found in the set -func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData { - var filteredStats []*nvml.StatsData - for _, statsItem := range stats { - if _, ok := ids[statsItem.UUID]; ok { - filteredStats = append(filteredStats, statsItem) - } - } - return filteredStats -} - -// writeStatsToChannel collects StatsData from NVML backend, groups StatsData -// by DeviceName attribute, populates DeviceGroupStats structure for every group -// and sends data over provided channel -func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { - statsData, err := d.nvmlClient.GetStatsData() - if err != nil { - d.logger.Error("failed to get nvidia stats", "error", err) - stats <- &device.StatsResponse{ - Error: err, - } - return - } - - // filter only stats from devices that are stored in NvidiaDevice struct - d.deviceLock.RLock() - statsData = filterStatsByID(statsData, d.devices) - d.deviceLock.RUnlock() - - // group stats by DeviceName struct field - statsListByDeviceName := make(map[string][]*nvml.StatsData) - for _, statsItem := range statsData { - deviceName := statsItem.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. 
This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) - } - - // place data device.DeviceGroupStats struct for every group of stats - deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) - for groupName, groupStats := range statsListByDeviceName { - deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) - } - - stats <- &device.StatsResponse{ - Groups: deviceGroupsStats, - } -} - -func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue { - return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)} -} - -// statsForGroup is a helper function that populates device.DeviceGroupStats -// for given groupName with groupStats list -func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { - instanceStats := make(map[string]*device.DeviceStats) - for _, statsItem := range groupStats { - instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) - } - - return &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - InstanceStats: instanceStats, - } -} - -// statsForItem is a helper function that populates device.DeviceStats for given -// nvml.StatsData -func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { - // nvml.StatsData holds pointers to values that can be nil - // In case they are nil return stats with 'notAvailable' constant - var ( - powerUsageStat *structs.StatValue - GPUUtilizationStat *structs.StatValue - memoryUtilizationStat *structs.StatValue - encoderUtilizationStat *structs.StatValue - decoderUtilizationStat *structs.StatValue - temperatureStat *structs.StatValue - memoryStateStat *structs.StatValue - BAR1StateStat 
*structs.StatValue - ECCErrorsL1CacheStat *structs.StatValue - ECCErrorsL2CacheStat *structs.StatValue - ECCErrorsDeviceStat *structs.StatValue - ) - - if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { - powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) - } else { - powerUsageStat = &structs.StatValue{ - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)), - IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW), - } - } - - if statsItem.GPUUtilization == nil { - GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) - } else { - GPUUtilizationStat = &structs.StatValue{ - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization), - } - } - - if statsItem.MemoryUtilization == nil { - memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) - } else { - memoryUtilizationStat = &structs.StatValue{ - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization), - } - } - - if statsItem.EncoderUtilization == nil { - encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) - } else { - encoderUtilizationStat = &structs.StatValue{ - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization), - } - } - - if statsItem.DecoderUtilization == nil { - decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) - } else { - decoderUtilizationStat = &structs.StatValue{ - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization), - } - } - - if statsItem.TemperatureC == nil { - temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) - } else { - temperatureStat = 
&structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC), - } - } - - if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { - memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) - } else { - memoryStateStat = &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB), - } - } - - if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { - BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) - } else { - BAR1StateStat = &structs.StatValue{ - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB), - } - } - - if statsItem.ECCErrorsL1Cache == nil { - ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) - } else { - ECCErrorsL1CacheStat = &structs.StatValue{ - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache), - } - } - - if statsItem.ECCErrorsL2Cache == nil { - ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) - } else { - ECCErrorsL2CacheStat = &structs.StatValue{ - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache), - } - } - - if statsItem.ECCErrorsDevice == nil { - ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) - } else { - ECCErrorsDeviceStat = &structs.StatValue{ - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice), - } - } - return &device.DeviceStats{ - Summary: memoryStateStat, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - 
PowerUsageAttr: powerUsageStat, - GPUUtilizationAttr: GPUUtilizationStat, - MemoryUtilizationAttr: memoryUtilizationStat, - EncoderUtilizationAttr: encoderUtilizationStat, - DecoderUtilizationAttr: decoderUtilizationStat, - TemperatureAttr: temperatureStat, - MemoryStateAttr: memoryStateStat, - BAR1StateAttr: BAR1StateStat, - ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, - ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, - ECCErrorsDeviceAttr: ECCErrorsDeviceStat, - }, - }, - Timestamp: timestamp, - } -} - -func uintToInt64Ptr(u *uint) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} - -func uint64ToInt64Ptr(u *uint64) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go deleted file mode 100644 index f6221e0f480..00000000000 --- a/devices/gpu/nvidia/stats_test.go +++ /dev/null @@ -1,3041 +0,0 @@ -package nvidia - -import ( - "errors" - "sort" - "testing" - "time" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestFilterStatsByID(t *testing.T) { - for _, testCase := range []struct { - Name string - ProvidedStats []*nvml.StatsData - ProvidedIDs map[string]struct{} - ExpectedResult []*nvml.StatsData - }{ - { - Name: "All ids are in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), 
- TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Odd are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Even are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "No Stats were provided", - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - }, - { - Name: "No Ids were provided", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - } { - actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForItem(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - ItemStat *nvml.StatsData - ExpectedResult 
*device.DeviceStats - }{ - { - Name: "All fields in ItemStat are not nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, 
- IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Power usage is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: nil, - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - 
IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "PowerW is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: nil, - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, 
time.UTC), - }, - }, - { - Name: "GPUUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: nil, - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: nil, - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "EncoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: nil, - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: 
helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "DecoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: nil, - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, 
- MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Temperature is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: nil, - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "UsedMemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - 
DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: nil, - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: 
MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1UsedMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: nil, - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1MiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: nil, - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - 
ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - 
Name: "ECCErrorsL1Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: nil, - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsL2Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: nil, - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsDevice is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: nil, - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - }, - }, - Timestamp: time.Date(1974, 
time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - } { - actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForGroup(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - GroupStats []*nvml.StatsData - GroupName string - ExpectedResult *device.DeviceGroupStats - }{ - { - Name: "make sure that all data is transformed correctly", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - GroupName: "DeviceName1", - GroupStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: 
helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - ExpectedResult: &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - 
BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: 
helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: 
helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - } { - actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestWriteStatsToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - ExpectedWriteToChannel *device.StatsResponse - Timestamp time.Time - Device *NvidiaDevice - }{ - { - Name: "NVML wrapper returns error", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ExpectedWriteToChannel: &device.StatsResponse{ - Error: errors.New(""), - }, - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - StatsError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - 
ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName3", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups 2", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: 
helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - 
ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - 
Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that only devices from 
NvidiaDevice.device map stats are reported", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - 
DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - } { - channel := make(chan *device.StatsResponse, 1) - testCase.Device.writeStatsToChannel(channel, testCase.Timestamp) - actualResult := <-channel - // writeStatsToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedWriteToChannel arrays has to be sorted firsted - sort.Slice(actualResult.Groups, func(i, j int) bool { - return actualResult.Groups[i].Name < actualResult.Groups[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name - }) - require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) - } -} diff --git a/go.mod b/go.mod index 53acd670a3a..9b222104e20 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,6 @@ require ( github.com/Azure/go-autorest/autorest/azure/auth v0.5.1 // indirect github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5 github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873 - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 github.com/NYTimes/gziphandler v1.0.1 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e github.com/armon/go-metrics v0.3.4 @@ -32,6 +31,7 @@ require ( github.com/coreos/go-iptables v0.4.3-0.20190724151750-969b135e941d github.com/coreos/go-semver v0.3.0 github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect + github.com/denverdino/aliyungo 
v0.0.0-20190125010748-a747050bb1ba // indirect github.com/docker/cli v0.0.0-20200303215952-eb310fca4956 github.com/docker/distribution v2.7.1+incompatible github.com/docker/docker v17.12.0-ce-rc1.0.20200330121334-7f8b4b621b5d+incompatible diff --git a/go.sum b/go.sum index 7f668d61233..f9ee95cd57a 100644 --- a/go.sum +++ b/go.sum @@ -64,8 +64,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN github.com/Microsoft/hcsshim v0.8.7/go.mod h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ= github.com/Microsoft/hcsshim v0.8.9 h1:VrfodqvztU8YSOvygU+DN1BGaSGxmrNfqOv5oOuX2Bk= github.com/Microsoft/hcsshim v0.8.9/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg38RRsjT5y8= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA= github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo= github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= @@ -182,8 +180,9 @@ github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mo github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661 h1:lrWnAyy/F72MbxIxFUzKmcMCdt9Oi8RzpAxzTNQHD7o= github.com/denverdino/aliyungo v0.0.0-20170926055100-d3308649c661/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0= +github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba h1:p6poVbjHDkKa+wtC8frBMwQtT3BmqGYBjzMwJ63tuR4= +github.com/denverdino/aliyungo 
v0.0.0-20190125010748-a747050bb1ba/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0= github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/digitalocean/godo v1.7.5/go.mod h1:h6faOIcZ8lWIwNQ+DN7b3CgX4Kwby5T+nbpNqkUIozU= diff --git a/helper/pluginutils/catalog/register_nvidia_linux.go b/helper/pluginutils/catalog/register_nvidia_linux.go deleted file mode 100644 index a50cbe833a7..00000000000 --- a/helper/pluginutils/catalog/register_nvidia_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build !nonvidia - -package catalog - -import ( - "github.com/hashicorp/nomad/devices/gpu/nvidia" -) - -// This file is where all builtin plugins should be registered in the catalog. -// Plugins with build restrictions should be placed in the appropriate -// register_XXX.go file. -func init() { - Register(nvidia.PluginID, nvidia.PluginConfig) -} diff --git a/website/content/docs/devices/external/index.mdx b/website/content/docs/devices/external/index.mdx index 1a13cc2882e..76ae4d726a9 100644 --- a/website/content/docs/devices/external/index.mdx +++ b/website/content/docs/devices/external/index.mdx @@ -1,30 +1,29 @@ --- layout: docs -page_title: 'Device Plugins: Community Supported' -description: A list of community supported Device Plugins. +page_title: 'Device Plugins: External' +description: 'A list of external Device Plugins.' --- -# Community Supported - -If you have authored a device plugin that you believe will be useful to the -broader Nomad community and you are committed to maintaining the plugin, please -file a PR to add your plugin to this page. - -## Device Plugins +# External Device Plugins Nomad has a plugin system for defining task drivers. External device driver plugins will have the same user experience as built in devices. 
-Below is a list of community-supported task drivers you can use with Nomad: +Below is a list of official external device plugins you can use with Nomad: -- [USB][usb] +- [Nvidia][nvidia] -## Authoring Device Plugins +## Community Supported -Nomad has a plugin system for defining device drivers. External device plugins -will have the same user experience as built in drivers. For details on -authoring a device plugin, please refer to the [plugin authoring -guide][plugin_guide]. +If you have authored a device plugin that you believe will be useful to the +broader Nomad community and you are committed to maintaining the plugin, +please file a PR to add your plugin to this page. For details on authoring a +device plugin, please refer to the [plugin authoring guide][plugin_guide]. + +Below is a list of community-supported device plugins you can use with Nomad: + +- [USB][usb] [plugin_guide]: /docs/internals/plugins +[nvidia]: /docs/devices/external/nvidia [usb]: /docs/devices/external/usb diff --git a/website/content/docs/devices/nvidia.mdx b/website/content/docs/devices/external/nvidia.mdx similarity index 100% rename from website/content/docs/devices/nvidia.mdx rename to website/content/docs/devices/external/nvidia.mdx diff --git a/website/content/docs/devices/index.mdx b/website/content/docs/devices/index.mdx index a342d8f4be0..28c5e58893e 100644 --- a/website/content/docs/devices/index.mdx +++ b/website/content/docs/devices/index.mdx @@ -6,18 +6,13 @@ description: Device Plugins are used to expose devices to tasks in Nomad. # Device Plugins -Device plugins are used to detect and make devices available to tasks in Nomad. -Devices are physical hardware that exists on a node such as a GPU or an FPGA. By -having extensible device plugins, Nomad has the flexibility to support a broad -set of devices and allows the community to build additional device plugins as -needed. +Device plugins are used to detect and make devices available to tasks in +Nomad. 
Devices are physical hardware that exists on a client node such as a +GPU or an FPGA. By having extensible device plugins, Nomad has the flexibility +to support a broad set of devices and allows the community to build additional +device plugins as needed. The list of supported device plugins is provided on the left of this page. Each device plugin documents its configuration and installation requirements, the attributes it fingerprints, and the environment variables it exposes to tasks. - -For details on authoring a device plugin, please refer to the [plugin authoring -guide][plugin_guide]. - -[plugin_guide]: /docs/internals/plugins diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index bc23e7597d0..b15e7b36e44 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -1442,16 +1442,16 @@ "path": "devices" }, { - "title": "Nvidia", - "path": "devices/nvidia" - }, - { - "title": "Community", + "title": "External", "routes": [ { "title": "Overview", "path": "devices/external" }, + { + "title": "Nvidia", + "path": "devices/external/nvidia" + }, { "title": "USB Beta", "path": "devices/external/usb"