From 6476390ee21584e782695af3e2034c5230797e28 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Thu, 2 Jul 2020 11:05:14 -0400 Subject: [PATCH 1/2] nvidia: support disabling the nvidia plugin --- devices/gpu/nvidia/device.go | 26 +++++++++++++++-- devices/gpu/nvidia/device_test.go | 46 ++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go index 064161cf5c5..c0abf874396 100644 --- a/devices/gpu/nvidia/device.go +++ b/devices/gpu/nvidia/device.go @@ -28,9 +28,7 @@ const ( // notAvailable value is returned to nomad server in case some properties were // undetected by nvml driver notAvailable = "N/A" -) -const ( // Nvidia-container-runtime environment variable names NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" ) @@ -59,6 +57,10 @@ var ( // configSpec is the specification of the plugin's configuration configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ + "enabled": hclspec.NewDefault( + hclspec.NewAttr("enabled", "bool", false), + hclspec.NewLiteral("true"), + ), "ignored_gpu_ids": hclspec.NewDefault( hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), hclspec.NewLiteral("[]"), @@ -68,16 +70,22 @@ var ( hclspec.NewLiteral("\"1m\""), ), }) + + errDeviceNotEnabled = fmt.Errorf("Nvidia device is not enabled") ) // Config contains configuration information for the plugin. type Config struct { + Enabled bool `codec:"enabled"` IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` FingerprintPeriod string `codec:"fingerprint_period"` } // NvidiaDevice contains all plugin specific data type NvidiaDevice struct { + // enabled indicates whether the plugin should be enabled + enabled bool + // nvmlClient is used to get data from nvidia nvmlClient nvml.NvmlClient @@ -133,6 +141,8 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { } } + d.enabled = config.Enabled + for _, ignoredGPUId := range config.IgnoredGPUIDs { d.ignoredGPUIDs[ignoredGPUId] = struct{}{} } @@ -149,6 +159,10 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { // Fingerprint streams detected devices. If device changes are detected or the // devices health changes, messages will be emitted. func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { + if !d.enabled { + return nil, errDeviceNotEnabled + } + outCh := make(chan *device.FingerprintResponse) go d.fingerprint(ctx, outCh) return outCh, nil @@ -169,6 +183,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation if len(deviceIDs) == 0 { return &device.ContainerReservation{}, nil } + if !d.enabled { + return nil, errDeviceNotEnabled + } + // Due to the asynchronous nature of NvidiaPlugin, there is a possibility // of race condition // @@ -202,6 +220,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation // Stats streams statistics for the detected devices. func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { + if !d.enabled { + return nil, errDeviceNotEnabled + } + outCh := make(chan *device.StatsResponse) go d.stats(ctx, outCh, interval) return outCh, nil diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go index 717491f2b69..0554a0b4205 100644 --- a/devices/gpu/nvidia/device_test.go +++ b/devices/gpu/nvidia/device_test.go @@ -26,7 +26,7 @@ func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { } func TestReserve(t *testing.T) { - for _, testCase := range []struct { + cases := []struct { Name string ExpectedReservation *device.ContainerReservation ExpectedError error @@ -47,7 +47,8 @@ func TestReserve(t *testing.T) { "UUID3", }, Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -66,7 +67,8 @@ func TestReserve(t *testing.T) { devices: map[string]struct{}{ "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -88,7 +90,8 @@ func TestReserve(t *testing.T) { "UUID2": {}, "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -102,13 +105,36 @@ func TestReserve(t *testing.T) { "UUID2": {}, "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, - } { - actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs) - req := require.New(t) - req.Equal(testCase.ExpectedReservation, actualReservation) - req.Equal(testCase.ExpectedError, actualError) + { + Name: "Device is disabled", + ExpectedReservation: nil, + ExpectedError: errDeviceNotEnabled, + RequestedIDs: []string{ + "UUID1", + "UUID2", + "UUID3", + }, + Device: &NvidiaDevice{ + devices: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + logger: hclog.NewNullLogger(), + enabled: false, + }, + }, + } + + for _, c := range cases { + t.Run(c.Name, func(t *testing.T) { + actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) + require.Equal(t, c.ExpectedReservation, actualReservation) + require.Equal(t, c.ExpectedError, actualError) + }) } } From ccf2bf91805d640f874a0aba0f176170aad30a78 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 21 Jul 2020 09:40:03 -0400 Subject: [PATCH 2/2] log info when plugin is disabled --- client/devicemanager/instance.go | 6 +++++- devices/gpu/nvidia/device.go | 8 +++----- devices/gpu/nvidia/device_test.go | 2 +- plugins/device/device.go | 5 +++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/client/devicemanager/instance.go b/client/devicemanager/instance.go index 3119f7207df..edeeb2e8eb7 100644 --- a/client/devicemanager/instance.go +++ b/client/devicemanager/instance.go @@ -336,7 +336,11 @@ START: // Start fingerprinting fingerprintCh, err := devicePlugin.Fingerprint(i.ctx) - if err != nil { + if err == device.ErrPluginDisabled { + i.logger.Info("fingerprinting failed: plugin is not enabled") + i.handleFingerprintError() + return + } else if err != nil { i.logger.Error("fingerprinting failed", "error", err) i.handleFingerprintError() return diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go index c0abf874396..67680dc2a0e 100644 --- a/devices/gpu/nvidia/device.go +++ b/devices/gpu/nvidia/device.go @@ -70,8 +70,6 @@ var ( hclspec.NewLiteral("\"1m\""), ), }) - - errDeviceNotEnabled = fmt.Errorf("Nvidia device is not enabled") ) // Config contains configuration information for the plugin. @@ -160,7 +158,7 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { // devices health changes, messages will be emitted. func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { if !d.enabled { - return nil, errDeviceNotEnabled + return nil, device.ErrPluginDisabled } outCh := make(chan *device.FingerprintResponse) @@ -184,7 +182,7 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation return &device.ContainerReservation{}, nil } if !d.enabled { - return nil, errDeviceNotEnabled + return nil, device.ErrPluginDisabled } // Due to the asynchronous nature of NvidiaPlugin, there is a possibility @@ -221,7 +219,7 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation // Stats streams statistics for the detected devices. func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { if !d.enabled { - return nil, errDeviceNotEnabled + return nil, device.ErrPluginDisabled } outCh := make(chan *device.StatsResponse) diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go index 0554a0b4205..a5ec354e243 100644 --- a/devices/gpu/nvidia/device_test.go +++ b/devices/gpu/nvidia/device_test.go @@ -112,7 +112,7 @@ func TestReserve(t *testing.T) { { Name: "Device is disabled", ExpectedReservation: nil, - ExpectedError: errDeviceNotEnabled, + ExpectedError: device.ErrPluginDisabled, RequestedIDs: []string{ "UUID1", "UUID2", diff --git a/plugins/device/device.go b/plugins/device/device.go index c1e8b0bf406..32fa16133a7 100644 --- a/plugins/device/device.go +++ b/plugins/device/device.go @@ -15,6 +15,11 @@ const ( DeviceTypeGPU = "gpu" ) +var ( + // ErrPluginDisabled indicates that the device plugin is disabled + ErrPluginDisabled = fmt.Errorf("device is not enabled") +) + // DevicePlugin is the interface for a plugin that can expose detected devices // to Nomad and inform it how to mount them. type DevicePlugin interface {