From 7c7569674c9dca66a32b677c3e7cb60dd4f41e6e Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 18 Mar 2021 15:35:11 -0400
Subject: [PATCH] CSI: unique volume per allocation

Add a `PerAlloc` field to volume requests that directs the scheduler to test
feasibility for volumes with a source ID that includes the allocation index
suffix (ex. `[0]`), rather than the exact source ID.

Read the `PerAlloc` field when making the volume claim at the client to
determine if the allocation index suffix (ex. `[0]`) should be added to the
volume source ID.
---
 CHANGELOG.md                                  |   1 +
 api/tasks.go                                  |   1 +
 client/allocrunner/csi_hook.go                |  15 +-
 command/agent/job_endpoint.go                 |   1 +
 e2e/csi/input/use-ebs-volume.nomad            |   5 +-
 e2e/terraform/compute.tf                      |   2 +-
 e2e/terraform/terraform.tfvars                |   2 +-
 e2e/terraform/volumes.tf                      |   4 +-
 nomad/state/state_store.go                    |   5 +-
 nomad/structs/diff_test.go                    |   7 +
 nomad/structs/funcs.go                        |  11 ++
 nomad/structs/structs.go                      |   7 +-
 nomad/structs/structs_test.go                 |  22 +++
 nomad/structs/volumes.go                      |   1 +
 scheduler/feasible.go                         |  21 ++-
 scheduler/feasible_test.go                    |  19 +-
 scheduler/generic_sched.go                    |   1 +
 scheduler/generic_sched_test.go               | 167 ++++++++++++++++++
 scheduler/stack.go                            |   5 +-
 scheduler/stack_test.go                       |   7 +-
 scheduler/system_sched.go                     |   2 +-
 scheduler/util.go                             |   5 +-
 .../github.com/hashicorp/nomad/api/tasks.go   |   1 +
 .../content/docs/job-specification/update.mdx |   3 +-
 .../content/docs/job-specification/volume.mdx |  85 ++++-----
 25 files changed, 329 insertions(+), 71 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 436f368fa1c..d5c20f5a964 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ BUG FIXES:
 IMPROVEMENTS:
 * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)]
 * consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)]
+ * csi: Added support for jobs to request a unique volume ID per allocation. [[GH-10136](https://github.com/hashicorp/nomad/issues/10136)]
 * driver/docker: Added support for optional extra container labels. [[GH-9885](https://github.com/hashicorp/nomad/issues/9885)]
 * driver/docker: Added support for configuring default logger behavior in the client configuration.
[[GH-10156](https://github.com/hashicorp/nomad/issues/10156)] diff --git a/api/tasks.go b/api/tasks.go index ed268423b99..9c43df47625 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -382,6 +382,7 @@ type VolumeRequest struct { Source string `hcl:"source,optional"` ReadOnly bool `hcl:"read_only,optional"` MountOptions *CSIMountOptions `hcl:"mount_options,block"` + PerAlloc bool `hcl:"per_alloc,optional"` ExtraKeysHCL []string `hcl1:",unusedKeys,optional" json:"-"` } diff --git a/client/allocrunner/csi_hook.go b/client/allocrunner/csi_hook.go index 88eaddaecf8..83787455df6 100644 --- a/client/allocrunner/csi_hook.go +++ b/client/allocrunner/csi_hook.go @@ -92,8 +92,14 @@ func (c *csiHook) Postrun() error { mode = structs.CSIVolumeClaimWrite } + source := pair.request.Source + if pair.request.PerAlloc { + // NOTE: PerAlloc can't be set if we have canaries + source = source + structs.AllocSuffix(c.alloc.Name) + } + req := &structs.CSIVolumeUnpublishRequest{ - VolumeID: pair.request.Source, + VolumeID: source, Claim: &structs.CSIVolumeClaim{ AllocationID: c.alloc.ID, NodeID: c.alloc.NodeID, @@ -159,8 +165,13 @@ func (c *csiHook) claimVolumesFromAlloc() (map[string]*volumeAndRequest, error) claimType = structs.CSIVolumeClaimRead } + source := pair.request.Source + if pair.request.PerAlloc { + source = source + structs.AllocSuffix(c.alloc.Name) + } + req := &structs.CSIVolumeClaimRequest{ - VolumeID: pair.request.Source, + VolumeID: source, AllocationID: c.alloc.ID, NodeID: c.alloc.NodeID, Claim: claimType, diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index ec21072f91c..8e28be0fd07 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -944,6 +944,7 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta Type: v.Type, ReadOnly: v.ReadOnly, Source: v.Source, + PerAlloc: v.PerAlloc, } if v.MountOptions != nil { diff --git a/e2e/csi/input/use-ebs-volume.nomad b/e2e/csi/input/use-ebs-volume.nomad index 8811d659373..1f30f6f6efe 100644 --- a/e2e/csi/input/use-ebs-volume.nomad +++ b/e2e/csi/input/use-ebs-volume.nomad @@ -9,8 +9,9 @@ job "use-ebs-volume" { group "group" { volume "test" { - type = "csi" - source = "ebs-vol0" + type = "csi" + source = "ebs-vol" + per_alloc = true } task "task" { diff --git a/e2e/terraform/compute.tf b/e2e/terraform/compute.tf index 7e4299e622b..5d171f18c49 100644 --- a/e2e/terraform/compute.tf +++ b/e2e/terraform/compute.tf @@ -63,7 +63,7 @@ data "external" "packer_sha" { sha=$(git log -n 1 --pretty=format:%H packer) echo "{\"sha\":\"$${sha}\"}" EOT -] + ] } diff --git a/e2e/terraform/terraform.tfvars b/e2e/terraform/terraform.tfvars index 183bec55803..f0c0f949da2 100644 --- a/e2e/terraform/terraform.tfvars +++ b/e2e/terraform/terraform.tfvars @@ -15,5 +15,5 @@ nomad_local_binary = "" # overrides nomad_sha and nomad_version if set # Example overrides: # nomad_sha = "38e23b62a7700c96f4898be777543869499fea0a" -# nomad_local_binary = "../../pkg/linux_amd/nomad" +# nomad_local_binary = "../../pkg/linux_amd64/nomad" # nomad_local_binary_client_windows_2016_amd64 = ["../../pkg/windows_amd64/nomad.exe"] diff --git a/e2e/terraform/volumes.tf b/e2e/terraform/volumes.tf index cd4bc922147..572241c9c93 100644 --- a/e2e/terraform/volumes.tf +++ b/e2e/terraform/volumes.tf @@ -30,8 +30,8 @@ data "template_file" "ebs_volume_hcl" { count = var.volumes ? 
1 : 0 template = < 0 { + mErr.Errors = append(mErr.Errors, + fmt.Errorf("Volume %s cannot be per_alloc when canaries are in use", name)) + } + if decl.Source == "" { mErr.Errors = append(mErr.Errors, fmt.Errorf("Volume %s has an empty source", name)) } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 4787c5832d4..09a411f2461 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1106,6 +1106,28 @@ func TestTaskGroup_Validate(t *testing.T) { err = tg.Validate(&Job{}) require.Contains(t, err.Error(), `Volume foo has an empty source`) + tg = &TaskGroup{ + Name: "group-a", + Update: &UpdateStrategy{ + Canary: 1, + }, + Volumes: map[string]*VolumeRequest{ + "foo": { + Type: "csi", + PerAlloc: true, + }, + }, + Tasks: []*Task{ + { + Name: "task-a", + Resources: &Resources{}, + }, + }, + } + err = tg.Validate(&Job{}) + require.Contains(t, err.Error(), `Volume foo has an empty source`) + require.Contains(t, err.Error(), `Volume foo cannot be per_alloc when canaries are in use`) + tg = &TaskGroup{ Volumes: map[string]*VolumeRequest{ "foo": { diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index 832c7af6653..ed6da134206 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -91,6 +91,7 @@ type VolumeRequest struct { Source string ReadOnly bool MountOptions *CSIMountOptions + PerAlloc bool } func (v *VolumeRequest) Copy() *VolumeRequest { diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 0d6346084de..bafe0458f53 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -227,23 +227,30 @@ func (c *CSIVolumeChecker) SetNamespace(namespace string) { c.namespace = namespace } -func (c *CSIVolumeChecker) SetVolumes(volumes map[string]*structs.VolumeRequest) { +func (c *CSIVolumeChecker) SetVolumes(allocName string, volumes map[string]*structs.VolumeRequest) { + xs := make(map[string]*structs.VolumeRequest) + // Filter to only CSI Volumes for alias, req := range volumes { if req.Type != structs.VolumeTypeCSI { continue } - - xs[alias] = req + if req.PerAlloc { + // provide a unique volume source per allocation + copied := req.Copy() + copied.Source = copied.Source + structs.AllocSuffix(allocName) + xs[alias] = copied + } else { + xs[alias] = req + } } c.volumes = xs } func (c *CSIVolumeChecker) Feasible(n *structs.Node) bool { - hasPlugins, failReason := c.hasPlugins(n) - - if hasPlugins { + ok, failReason := c.isFeasible(n) + if ok { return true } @@ -251,7 +258,7 @@ func (c *CSIVolumeChecker) Feasible(n *structs.Node) bool { return false } -func (c *CSIVolumeChecker) hasPlugins(n *structs.Node) (bool, string) { +func (c *CSIVolumeChecker) isFeasible(n *structs.Node) (bool, string) { // We can mount the volume if // - if required, a healthy controller plugin is running the driver // - the volume has free claims, or this job owns the claims diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 0e9ec8d7880..d8d4a12bf92 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -309,6 +309,13 @@ func TestCSIVolumeChecker(t *testing.T) { require.NoError(t, err) index++ + vid3 := "volume-id[0]" + vol3 := vol.Copy() + vol3.ID = vid3 + err = state.CSIVolumeRegister(index, []*structs.CSIVolume{vol3}) + require.NoError(t, err) + index++ + alloc := mock.Alloc() alloc.NodeID = nodes[4].ID alloc.Job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{ @@ -332,11 +339,17 @@ func TestCSIVolumeChecker(t *testing.T) { noVolumes := 
map[string]*structs.VolumeRequest{} volumes := map[string]*structs.VolumeRequest{ - "baz": { + "shared": { Type: "csi", Name: "baz", Source: "volume-id", }, + "unique": { + Type: "csi", + Name: "baz", + Source: "volume-id[0]", + PerAlloc: true, + }, "nonsense": { Type: "host", Name: "nonsense", @@ -390,7 +403,7 @@ func TestCSIVolumeChecker(t *testing.T) { } for i, c := range cases { - checker.SetVolumes(c.RequestedVolumes) + checker.SetVolumes(alloc.Name, c.RequestedVolumes) if act := checker.Feasible(c.Node); act != c.Result { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } @@ -407,7 +420,7 @@ func TestCSIVolumeChecker(t *testing.T) { checker.SetNamespace(structs.DefaultNamespace) for _, node := range nodes { - checker.SetVolumes(volumes) + checker.SetVolumes(alloc.Name, volumes) act := checker.Feasible(node) require.False(t, act, "request with missing volume should never be feasible") } diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 516fbf49c1b..852528874eb 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -547,6 +547,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // Compute penalty nodes for rescheduled allocs selectOptions := getSelectOptions(prevAllocation, preferredNode) + selectOptions.AllocName = missing.Name() option := s.selectNextOption(tg, selectOptions) // Store the available nodes by datacenter diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 842a85b29c6..7c5f7cd7a52 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -5779,3 +5779,170 @@ func TestServiceSched_RunningWithNextAllocation(t *testing.T) { require.Len(t, allocsByVersion[1], 2) require.Len(t, allocsByVersion[0], 3) } + +func TestServiceSched_CSIVolumesPerAlloc(t *testing.T) { + h := NewHarness(t) + require := require.New(t) + + // Create some nodes, each running the CSI plugin + for i := 0; i < 5; i++ { + node := mock.Node() + node.CSINodePlugins = map[string]*structs.CSIInfo{ + "test-plugin": { + PluginID: "test-plugin", + Healthy: true, + NodeInfo: &structs.CSINodeInfo{MaxVolumes: 2}, + }, + } + require.NoError(h.State.UpsertNode( + structs.MsgTypeTestSetup, h.NextIndex(), node)) + } + + // create per-alloc volumes + vol0 := structs.NewCSIVolume("volume-unique[0]", 0) + vol0.PluginID = "test-plugin" + vol0.Namespace = structs.DefaultNamespace + vol0.AccessMode = structs.CSIVolumeAccessModeSingleNodeWriter + vol0.AttachmentMode = structs.CSIVolumeAttachmentModeFilesystem + + vol1 := vol0.Copy() + vol1.ID = "volume-unique[1]" + vol2 := vol0.Copy() + vol2.ID = "volume-unique[2]" + + // create shared volume + shared := vol0.Copy() + shared.ID = "volume-shared" + // TODO: this should cause a test failure, see GH-10157 + // replace this value with structs.CSIVolumeAccessModeSingleNodeWriter + // once its been fixed + shared.AccessMode = structs.CSIVolumeAccessModeMultiNodeReader + + require.NoError(h.State.CSIVolumeRegister( + h.NextIndex(), []*structs.CSIVolume{shared, vol0, vol1, vol2})) + + // Create a job that uses both + job := mock.Job() + job.TaskGroups[0].Count = 3 + job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{ + "shared": { + Type: "csi", + Name: "shared", + Source: "volume-shared", + ReadOnly: true, + }, + "unique": { + Type: "csi", + Name: "unique", + Source: "volume-unique", + PerAlloc: true, + }, + } + + require.NoError(h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + // Create a mock 
evaluation to register the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + + require.NoError(h.State.UpsertEvals(structs.MsgTypeTestSetup, + h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation and expect a single plan without annotations + err := h.Process(NewServiceScheduler, eval) + require.NoError(err) + require.Len(h.Plans, 1, "expected one plan") + require.Nil(h.Plans[0].Annotations, "expected no annotations") + + // Expect the eval has not spawned a blocked eval + require.Equal(len(h.CreateEvals), 0) + require.Equal("", h.Evals[0].BlockedEval, "did not expect a blocked eval") + require.Equal(structs.EvalStatusComplete, h.Evals[0].Status) + + // Ensure the plan allocated and we got expected placements + var planned []*structs.Allocation + for _, allocList := range h.Plans[0].NodeAllocation { + planned = append(planned, allocList...) + } + require.Len(planned, 3, "expected 3 planned allocations") + + out, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, false) + require.NoError(err) + require.Len(out, 3, "expected 3 placed allocations") + + // Allocations don't have references to the actual volumes assigned, but + // because we set a max of 2 volumes per Node plugin, we can verify that + // they've been properly scheduled by making sure they're all on separate + // clients. + seen := map[string]struct{}{} + for _, alloc := range out { + _, ok := seen[alloc.NodeID] + require.False(ok, "allocations should be scheduled to separate nodes") + seen[alloc.NodeID] = struct{}{} + } + + // Update the job to 5 instances + job.TaskGroups[0].Count = 5 + require.NoError(h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + // Create a new eval and process it. It should not create a new plan. + eval.ID = uuid.Generate() + require.NoError(h.State.UpsertEvals(structs.MsgTypeTestSetup, + h.NextIndex(), []*structs.Evaluation{eval})) + err = h.Process(NewServiceScheduler, eval) + require.NoError(err) + require.Len(h.Plans, 1, "expected one plan") + + // Expect the eval to have failed + require.NotEqual("", h.Evals[1].BlockedEval, + "expected a blocked eval to be spawned") + require.Equal(2, h.Evals[1].QueuedAllocations["web"], "expected 2 queued allocs") + require.Equal(1, h.Evals[1].FailedTGAllocs["web"]. + ConstraintFiltered["missing CSI Volume volume-unique[3]"]) + + // Upsert 2 more per-alloc volumes + vol4 := vol0.Copy() + vol4.ID = "volume-unique[3]" + vol5 := vol0.Copy() + vol5.ID = "volume-unique[4]" + require.NoError(h.State.CSIVolumeRegister( + h.NextIndex(), []*structs.CSIVolume{vol4, vol5})) + + // Process again with failure fixed. It should create a new plan + eval.ID = uuid.Generate() + require.NoError(h.State.UpsertEvals(structs.MsgTypeTestSetup, + h.NextIndex(), []*structs.Evaluation{eval})) + err = h.Process(NewServiceScheduler, eval) + require.NoError(err) + require.Len(h.Plans, 2, "expected two plans") + require.Nil(h.Plans[1].Annotations, "expected no annotations") + + require.Equal("", h.Evals[2].BlockedEval, "did not expect a blocked eval") + require.Len(h.Evals[2].FailedTGAllocs, 0) + + // Ensure the plan allocated and we got expected placements + planned = []*structs.Allocation{} + for _, allocList := range h.Plans[1].NodeAllocation { + planned = append(planned, allocList...) 
+ } + require.Len(planned, 2, "expected 2 new planned allocations") + + out, err = h.State.AllocsByJob(nil, job.Namespace, job.ID, false) + require.NoError(err) + require.Len(out, 5, "expected 5 placed allocations total") + + // Make sure they're still all on seperate clients + seen = map[string]struct{}{} + for _, alloc := range out { + _, ok := seen[alloc.NodeID] + require.False(ok, "allocations should be scheduled to separate nodes") + seen[alloc.NodeID] = struct{}{} + } + +} diff --git a/scheduler/stack.go b/scheduler/stack.go index bccabc7899a..f362df9a9c0 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -35,6 +35,7 @@ type SelectOptions struct { PenaltyNodeIDs map[string]struct{} PreferredNodes []*structs.Node Preempt bool + AllocName string } // GenericStack is the Stack used for the Generic scheduler. It is @@ -143,7 +144,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) s.taskGroupHostVolumes.SetVolumes(tg.Volumes) - s.taskGroupCSIVolumes.SetVolumes(tg.Volumes) + s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) } @@ -297,7 +298,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) s.taskGroupHostVolumes.SetVolumes(tg.Volumes) - s.taskGroupCSIVolumes.SetVolumes(tg.Volumes) + s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) } diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go index 4650546d32f..4750b668aa2 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -240,7 +240,7 @@ func TestServiceStack_Select_CSI(t *testing.T) { // Create a volume in the state store index := uint64(999) - v := structs.NewCSIVolume("foo", index) + v := structs.NewCSIVolume("foo[0]", index) v.Namespace = structs.DefaultNamespace v.AccessMode = structs.CSIVolumeAccessModeMultiNodeSingleWriter v.AttachmentMode = structs.CSIVolumeAttachmentModeFilesystem @@ -284,16 +284,19 @@ func TestServiceStack_Select_CSI(t *testing.T) { stack.SetNodes(nodes) job := mock.Job() + job.TaskGroups[0].Count = 2 job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{"foo": { Name: "bar", Type: structs.VolumeTypeCSI, Source: "foo", ReadOnly: true, + PerAlloc: true, }} stack.SetJob(job) - selectOptions := &SelectOptions{} + selectOptions := &SelectOptions{ + AllocName: structs.AllocName(job.Name, job.TaskGroups[0].Name, 0)} node := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 4b1e5c8cbfa..22684beb8af 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -284,7 +284,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { s.stack.SetNodes(nodes) // Attempt to match the task group - option := s.stack.Select(missing.TaskGroup, nil) + option := s.stack.Select(missing.TaskGroup, &SelectOptions{AllocName: missing.Name}) if option == nil { // If the task can't be placed on this node, update reporting data diff --git a/scheduler/util.go b/scheduler/util.go index 777c0e00f2b..86461a8f655 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -695,7 +695,8 @@ func inplaceUpdate(ctx Context, eval 
*structs.Evaluation, job *structs.Job, ctx.Plan().AppendStoppedAlloc(update.Alloc, allocInPlace, "", "") // Attempt to match the task group - option := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions + option := stack.Select(update.TaskGroup, + &SelectOptions{AllocName: update.Alloc.Name}) // Pop the allocation ctx.Plan().PopUpdate(update.Alloc) @@ -977,7 +978,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ctx.Plan().AppendStoppedAlloc(existing, allocInPlace, "", "") // Attempt to match the task group - option := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions + option := stack.Select(newTG, &SelectOptions{AllocName: existing.Name}) // Pop the allocation ctx.Plan().PopUpdate(existing) diff --git a/vendor/github.com/hashicorp/nomad/api/tasks.go b/vendor/github.com/hashicorp/nomad/api/tasks.go index ed268423b99..9c43df47625 100644 --- a/vendor/github.com/hashicorp/nomad/api/tasks.go +++ b/vendor/github.com/hashicorp/nomad/api/tasks.go @@ -382,6 +382,7 @@ type VolumeRequest struct { Source string `hcl:"source,optional"` ReadOnly bool `hcl:"read_only,optional"` MountOptions *CSIMountOptions `hcl:"mount_options,block"` + PerAlloc bool `hcl:"per_alloc,optional"` ExtraKeysHCL []string `hcl1:",unusedKeys,optional" json:"-"` } diff --git a/website/content/docs/job-specification/update.mdx b/website/content/docs/job-specification/update.mdx index 74a5bd9146f..cd779693274 100644 --- a/website/content/docs/job-specification/update.mdx +++ b/website/content/docs/job-specification/update.mdx @@ -105,7 +105,8 @@ a future release. destructive updates should create the specified number of canaries without stopping any previous allocations. Once the operator determines the canaries are healthy, they can be promoted which unblocks a rolling update of the - remaining allocations at a rate of `max_parallel`. + remaining allocations at a rate of `max_parallel`. Canary deployments cannot + be used with CSI volumes when `per_alloc = true`. - `stagger` `(string: "30s")` - Specifies the delay between each set of [`max_parallel`](#max_parallel) updates when updating system jobs. This diff --git a/website/content/docs/job-specification/volume.mdx b/website/content/docs/job-specification/volume.mdx index 30aa010767e..662e0bae52b 100644 --- a/website/content/docs/job-specification/volume.mdx +++ b/website/content/docs/job-specification/volume.mdx @@ -49,6 +49,13 @@ the [volume_mount][volume_mount] stanza in the `task` configuration. name of the host volume. When using `csi` volumes, this should match the ID of the registered volume. +- `per_alloc` `(bool: false)` - Specifies that the `source` of the volume + should have the suffix `[n]`, where `n` is the allocation index. This allows + mounting a unique volume per allocation, so long as the volume's source is + named appropriately. For example, with the source `myvolume` and `per_alloc + = true`, the allocation named `myjob.mygroup.mytask[0]` will require a + volume ID `myvolume[0]`. + - `read_only` `(bool: false)` - Specifies that the group only requires read only access to a volume and is used as the default value for the `volume_mount -> read_only` configuration. This value is also @@ -74,64 +81,62 @@ There are two limitations to using HCL2 interpolation for `volume` blocks: - The `volume` block is used to schedule workloads, so any interpolation needs to be done before placement. 
This means that variables like - `NOMAD_ALLOC_INDEX` can't be used for interpolation. + `NOMAD_ALLOC_INDEX` can't be used for interpolation. Use the `per_alloc` + field described above. - Nomad does not yet support dynamic volume creation (see [GH-8212]), so volumes must be created and registered before being used as a `volume.source`. The following job specification demonstrates how to use multiple volumes with -multiple allocations. It uses a `dynamic` block to create a new task group for -each of the two volumes. This job specification also shows using HCL2 -variables interpolation to expose information to the task's environment. +multiple allocations, using the `per_alloc` field. This job specification also +shows using HCL2 -variables interpolation to expose information to the task's +environment. ```hcl variables { - volume_index = ["0", "1"] - path = "test" + path = "test" } job "example" { datacenters = ["dc1"] - dynamic "group" { - for_each = var.volume_index - labels = ["cache-${group.value}"] + group "cache" { + + count = 2 - content { + volume "cache-volume" { + type = "csi" + source = "test-volume" + per_alloc = true + } - volume "cache-volume" { - type = "csi" - source = "test-volume${group.value}" + network { + port "db" { + to = 6379 } + } - network { - port "db" { - to = 6379 - } + task "redis" { + driver = "docker" + config { + image = "redis:3.2" + ports = ["db"] + } + resources { + cpu = 500 + memory = 256 } - task "redis" { - driver = "docker" - config { - image = "redis:3.2" - ports = ["db"] - } - resources { - cpu = 500 - memory = 256 - } - - env { - # this will be available as the MOUNT_PATH environment - # variable in the task - MOUNT_PATH = "${NOMAD_ALLOC_DIR}/${var.path}" - } - - volume_mount { - volume = "cache-volume" - destination = "${NOMAD_ALLOC_DIR}/${var.path}" - } + env { + # this will be available as the MOUNT_PATH environment + # variable in the task + MOUNT_PATH = "${NOMAD_ALLOC_DIR}/${var.path}" + } + volume_mount { + volume = "cache-volume" + destination = "${NOMAD_ALLOC_DIR}/${var.path}" } + } } } @@ -150,13 +155,13 @@ ID Node ID Task Group Version Desired Status Created Modified 81d32909 352c6926 cache-1 0 run running 4s ago 3s ago ce6fbfc8 352c6926 cache-0 0 run running 4s ago 3s ago -$ nomad volume status test-volume0 +$ nomad volume status 'test-volume[0]' ... Allocations ID Node ID Task Group Version Desired Status Created Modified ce6fbfc8 352c6926 cache-0 0 run running 29s ago 18s ago -$ nomad volume status test-volume1 +$ nomad volume status 'test-volume[1]' ... Allocations ID Node ID Task Group Version Desired Status Created Modified
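
The heart of this change is a naming convention: when `per_alloc = true`, both the scheduler (when checking feasibility) and the client (when claiming) append the allocation's index suffix to the requested `source`. Below is a minimal standalone sketch of that convention, assuming the suffix is the trailing `[n]` taken from the allocation name; the real logic lives in `structs.AllocSuffix` and may differ, and `allocSuffix` and `perAllocSource` are hypothetical names used only for illustration.

```go
package main

import (
	"fmt"
	"strings"
)

// allocSuffix returns the trailing "[n]" index portion of an allocation
// name such as "example.cache[1]". It stands in for the behavior this
// patch gets from structs.AllocSuffix.
func allocSuffix(allocName string) string {
	idx := strings.LastIndex(allocName, "[")
	if idx == -1 {
		return ""
	}
	return allocName[idx:]
}

// perAllocSource builds the volume ID that the client claims and the
// scheduler tests for feasibility when per_alloc = true.
func perAllocSource(source, allocName string) string {
	return source + allocSuffix(allocName)
}

func main() {
	// allocation "example.cache[1]" maps to volume ID "test-volume[1]"
	fmt.Println(perAllocSource("test-volume", "example.cache[1]"))
}
```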