From 60dd7aecc9e4beeb282bbe9ba203eb5d5be6fcc0 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 21 Jul 2020 10:11:16 -0400 Subject: [PATCH] nvidia: support disabling the nvidia plugin (#8353) --- client/devicemanager/instance.go | 6 +++- devices/gpu/nvidia/device.go | 24 ++++++++++++++-- devices/gpu/nvidia/device_test.go | 46 ++++++++++++++++++++++++------- plugins/device/device.go | 5 ++++ 4 files changed, 68 insertions(+), 13 deletions(-) diff --git a/client/devicemanager/instance.go b/client/devicemanager/instance.go index 3119f7207df..edeeb2e8eb7 100644 --- a/client/devicemanager/instance.go +++ b/client/devicemanager/instance.go @@ -336,7 +336,11 @@ START: // Start fingerprinting fingerprintCh, err := devicePlugin.Fingerprint(i.ctx) - if err != nil { + if err == device.ErrPluginDisabled { + i.logger.Info("fingerprinting failed: plugin is not enabled") + i.handleFingerprintError() + return + } else if err != nil { i.logger.Error("fingerprinting failed", "error", err) i.handleFingerprintError() return diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go index 064161cf5c5..67680dc2a0e 100644 --- a/devices/gpu/nvidia/device.go +++ b/devices/gpu/nvidia/device.go @@ -28,9 +28,7 @@ const ( // notAvailable value is returned to nomad server in case some properties were // undetected by nvml driver notAvailable = "N/A" -) -const ( // Nvidia-container-runtime environment variable names NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" ) @@ -59,6 +57,10 @@ var ( // configSpec is the specification of the plugin's configuration configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ + "enabled": hclspec.NewDefault( + hclspec.NewAttr("enabled", "bool", false), + hclspec.NewLiteral("true"), + ), "ignored_gpu_ids": hclspec.NewDefault( hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), hclspec.NewLiteral("[]"), @@ -72,12 +74,16 @@ var ( // Config contains configuration information for the plugin. type Config struct { + Enabled bool `codec:"enabled"` IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` FingerprintPeriod string `codec:"fingerprint_period"` } // NvidiaDevice contains all plugin specific data type NvidiaDevice struct { + // enabled indicates whether the plugin should be enabled + enabled bool + // nvmlClient is used to get data from nvidia nvmlClient nvml.NvmlClient @@ -133,6 +139,8 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { } } + d.enabled = config.Enabled + for _, ignoredGPUId := range config.IgnoredGPUIDs { d.ignoredGPUIDs[ignoredGPUId] = struct{}{} } @@ -149,6 +157,10 @@ func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { // Fingerprint streams detected devices. If device changes are detected or the // devices health changes, messages will be emitted. func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { + if !d.enabled { + return nil, device.ErrPluginDisabled + } + outCh := make(chan *device.FingerprintResponse) go d.fingerprint(ctx, outCh) return outCh, nil @@ -169,6 +181,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation if len(deviceIDs) == 0 { return &device.ContainerReservation{}, nil } + if !d.enabled { + return nil, device.ErrPluginDisabled + } + // Due to the asynchronous nature of NvidiaPlugin, there is a possibility // of race condition // @@ -202,6 +218,10 @@ func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation // Stats streams statistics for the detected devices. func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { + if !d.enabled { + return nil, device.ErrPluginDisabled + } + outCh := make(chan *device.StatsResponse) go d.stats(ctx, outCh, interval) return outCh, nil diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go index 717491f2b69..a5ec354e243 100644 --- a/devices/gpu/nvidia/device_test.go +++ b/devices/gpu/nvidia/device_test.go @@ -26,7 +26,7 @@ func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { } func TestReserve(t *testing.T) { - for _, testCase := range []struct { + cases := []struct { Name string ExpectedReservation *device.ContainerReservation ExpectedError error @@ -47,7 +47,8 @@ func TestReserve(t *testing.T) { "UUID3", }, Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -66,7 +67,8 @@ func TestReserve(t *testing.T) { devices: map[string]struct{}{ "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -88,7 +90,8 @@ func TestReserve(t *testing.T) { "UUID2": {}, "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, { @@ -102,13 +105,36 @@ func TestReserve(t *testing.T) { "UUID2": {}, "UUID3": {}, }, - logger: hclog.NewNullLogger(), + logger: hclog.NewNullLogger(), + enabled: true, }, }, - } { - actualReservation, actualError := testCase.Device.Reserve(testCase.RequestedIDs) - req := require.New(t) - req.Equal(testCase.ExpectedReservation, actualReservation) - req.Equal(testCase.ExpectedError, actualError) + { + Name: "Device is disabled", + ExpectedReservation: nil, + ExpectedError: device.ErrPluginDisabled, + RequestedIDs: []string{ + "UUID1", + "UUID2", + "UUID3", + }, + Device: &NvidiaDevice{ + devices: map[string]struct{}{ + "UUID1": {}, + "UUID2": {}, + "UUID3": {}, + }, + logger: hclog.NewNullLogger(), + enabled: false, + }, + }, + } + + for _, c := range cases { + t.Run(c.Name, func(t *testing.T) { + actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) + require.Equal(t, c.ExpectedReservation, actualReservation) + require.Equal(t, c.ExpectedError, actualError) + }) } } diff --git a/plugins/device/device.go b/plugins/device/device.go index c1e8b0bf406..32fa16133a7 100644 --- a/plugins/device/device.go +++ b/plugins/device/device.go @@ -15,6 +15,11 @@ const ( DeviceTypeGPU = "gpu" ) +var ( + // ErrPluginDisabled indicates that the device plugin is disabled + ErrPluginDisabled = fmt.Errorf("device is not enabled") +) + // DevicePlugin is the interface for a plugin that can expose detected devices // to Nomad and inform it how to mount them. type DevicePlugin interface {