diff --git a/.changelog/13340.txt b/.changelog/13340.txt new file mode 100644 index 00000000000..1f8e40f21b9 --- /dev/null +++ b/.changelog/13340.txt @@ -0,0 +1,3 @@ +```release-note:improvements +csi: Made the CSI Plugin supervisor health check configurable with a new CSI Stanza health_timeout field +``` diff --git a/api/tasks.go b/api/tasks.go index efc1b87af9d..33a4acc16b1 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -1018,10 +1018,18 @@ type TaskCSIPluginConfig struct { // // Default is /csi. MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"` + + // HealthTimeout is the time after which the CSI plugin tasks will be killed + // if the CSI Plugin is not healthy. + HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"` } func (t *TaskCSIPluginConfig) Canonicalize() { if t.MountDir == "" { t.MountDir = "/csi" } + + if t.HealthTimeout == 0 { + t.HealthTimeout = 30 * time.Second + } } diff --git a/client/allocrunner/taskrunner/plugin_supervisor_hook.go b/client/allocrunner/taskrunner/plugin_supervisor_hook.go index 0903e1487e9..c7cfa2454b4 100644 --- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go +++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go @@ -88,6 +88,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi pluginRoot := filepath.Join(config.clientStateDirPath, "csi", string(task.CSIPluginConfig.Type), task.CSIPluginConfig.ID) + if task.CSIPluginConfig.HealthTimeout == 0 { + task.CSIPluginConfig.HealthTimeout = 30 * time.Second + } + shutdownCtx, cancelFn := context.WithCancel(context.Background()) hook := &csiPluginSupervisorHook{ @@ -231,7 +235,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) { // We're in Poststart at this point, so if we can't connect within // this deadline, assume it's broken so we can restart the task - startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second) + startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout) defer startCancelFn() var err error @@ -414,7 +418,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) { if err := h.lifecycle.Kill(ctx, structs.NewTaskEvent(structs.TaskKilling). SetFailsTask(). - SetDisplayMessage("CSI plugin did not become healthy before timeout"), + SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())), ); err != nil { h.logger.Error("failed to kill task", "kill_reason", reason, "error", err) } diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 8a99a77b297..4de07d9c1cc 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1166,6 +1166,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf sc.ID = apiConfig.ID sc.Type = structs.CSIPluginType(apiConfig.Type) sc.MountDir = apiConfig.MountDir + sc.HealthTimeout = apiConfig.HealthTimeout return sc } diff --git a/jobspec/parse_task.go b/jobspec/parse_task.go index ff81b6ba66a..4bc77c310f2 100644 --- a/jobspec/parse_task.go +++ b/jobspec/parse_task.go @@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) { i := o.Elem().Items[0] var m map[string]interface{} + var cfg api.TaskCSIPluginConfig if err := hcl.DecodeObject(&m, i.Val); err != nil { return nil, err } - var cfg api.TaskCSIPluginConfig - if err := mapstructure.WeakDecode(m, &cfg); err != nil { + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: &cfg, + }) + if err != nil { + return nil, err + } + if err := dec.Decode(m); err != nil { return nil, err } diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 389d01245d1..1af37804371 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -625,9 +625,10 @@ func TestParse(t *testing.T) { Name: "binstore", Driver: "docker", CSIPluginConfig: &api.TaskCSIPluginConfig{ - ID: "org.hashicorp.csi", - Type: api.CSIPluginTypeMonolith, - MountDir: "/csi/test", + ID: "org.hashicorp.csi", + Type: api.CSIPluginTypeMonolith, + MountDir: "/csi/test", + HealthTimeout: 1 * time.Minute, }, }, }, diff --git a/jobspec/test-fixtures/csi-plugin.hcl b/jobspec/test-fixtures/csi-plugin.hcl index b879da18434..3e4106719d5 100644 --- a/jobspec/test-fixtures/csi-plugin.hcl +++ b/jobspec/test-fixtures/csi-plugin.hcl @@ -4,9 +4,10 @@ job "binstore-storagelocker" { driver = "docker" csi_plugin { - id = "org.hashicorp.csi" - type = "monolith" - mount_dir = "/csi/test" + id = "org.hashicorp.csi" + type = "monolith" + mount_dir = "/csi/test" + health_timeout = "1m" } } } diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go index 9f90335ac7d..d8eae4d3ea9 100644 --- a/nomad/structs/csi.go +++ b/nomad/structs/csi.go @@ -66,6 +66,10 @@ type TaskCSIPluginConfig struct { // to be created by the plugin, and will provide references into // "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts. MountDir string + + // HealthTimeout is the time after which the CSI plugin tasks will be killed + // if the CSI Plugin is not healthy. + HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"` } func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig { diff --git a/website/content/docs/job-specification/csi_plugin.mdx b/website/content/docs/job-specification/csi_plugin.mdx index b72b8998c85..ddee5bfe5f1 100644 --- a/website/content/docs/job-specification/csi_plugin.mdx +++ b/website/content/docs/job-specification/csi_plugin.mdx @@ -17,9 +17,10 @@ to claim [volumes][csi_volumes]. ```hcl csi_plugin { - id = "csi-hostpath" - type = "monolith" - mount_dir = "/csi" + id = "csi-hostpath" + type = "monolith" + mount_dir = "/csi" + health_timeout = "30s" } ``` @@ -43,6 +44,11 @@ csi_plugin { container where the plugin will expect a Unix domain socket for bidirectional communication with Nomad. +- `health_timeout` `(duration: )` - The duration that + the plugin supervisor will wait before restarting an unhealthy + CSI plugin. Must be a duration value such as `30s` or `2m`. + Defaults to `30s` if not set. + ~> **Note:** Plugins running as `node` or `monolith` require root privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the host. With the Docker task driver, you can use the `privileged = true` @@ -112,10 +118,11 @@ job "plugin-efs" { } csi_plugin { - id = "aws-efs0" - type = "node" - mount_dir = "/csi" # this path /csi matches the --endpoint + id = "aws-efs0" + type = "node" + mount_dir = "/csi" # this path /csi matches the --endpoint # argument for the container + health_timeout = "30s" } } }