diff --git a/client/allocrunner/taskrunner/plugin_supervisor_hook.go b/client/allocrunner/taskrunner/plugin_supervisor_hook.go index 50713fc075c..4696bc53f38 100644 --- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go +++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go @@ -103,6 +103,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi socketMountPoint := filepath.Join(config.clientStateDirPath, "csi", "plugins", config.runner.Alloc().ID) + if task.CSIPluginConfig.HealthTimeout == 0 { + task.CSIPluginConfig.HealthTimeout = 30 * time.Second + } + shutdownCtx, cancelFn := context.WithCancel(context.Background()) hook := &csiPluginSupervisorHook{ diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index c60afd713b8..2692f7cb8a1 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -7258,10 +7258,6 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk, jobType string, tgServices mErr.Errors = append(mErr.Errors, fmt.Errorf("CSIPluginConfig PluginType must be one of 'node', 'controller', or 'monolith', got: \"%s\"", t.CSIPluginConfig.Type)) } - if t.CSIPluginConfig.HealthTimeout == 0 { - t.CSIPluginConfig.HealthTimeout = 30 * time.Second - } - // TODO: Investigate validation of the PluginMountDir. Not much we can do apart from check IsAbs until after we understand its execution environment though :( } diff --git a/website/content/docs/job-specification/csi_plugin.mdx b/website/content/docs/job-specification/csi_plugin.mdx index 55cf152c599..811a4cd5ce6 100644 --- a/website/content/docs/job-specification/csi_plugin.mdx +++ b/website/content/docs/job-specification/csi_plugin.mdx @@ -17,9 +17,10 @@ to claim [volumes][csi_volumes]. 
```hcl csi_plugin { - id = "csi-hostpath" - type = "monolith" - mount_dir = "/csi" + id = "csi-hostpath" + type = "monolith" + mount_dir = "/csi" + health_timeout = "30s" } ``` @@ -43,6 +44,11 @@ csi_plugin { container where the plugin will expect a Unix domain socket for bidirectional communication with Nomad. +- `health_timeout` `(duration: "30s")` - The duration that + the plugin supervisor will wait before restarting an unhealthy + CSI plugin. Must be a duration value such as `30s` or `2m`. + Defaults to `30s` if not set. + ~> **Note:** Plugins running as `node` or `monolith` require root privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the host. With the Docker task driver, you can use the `privileged = true` @@ -111,10 +117,11 @@ job "plugin-efs" { } csi_plugin { - id = "aws-efs0" - type = "node" - mount_dir = "/csi" # this path /csi matches the --endpoint + id = "aws-efs0" + type = "node" + mount_dir = "/csi" # this path /csi matches the --endpoint # argument for the container + health_timeout = "30s" } } }