From 23604e0d3dfdfb57deb4d7f26bb46701297d8386 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 28 Sep 2020 10:48:28 -0500 Subject: [PATCH] consul: fix validation of task in group-level script-checks When defining a script-check in a group-level service, Nomad needs to know which task is associated with the check so that it can use the correct task driver to execute the check. This PR fixes two bugs: 1) validate service.task or service.check.task is configured 2) make service.check.task inherit service.task if it is itself unset Fixes #8952 --- CHANGELOG.md | 1 + .../taskrunner/script_check_hook.go | 22 ++++++- .../taskrunner/script_check_hook_test.go | 24 +++++++ nomad/job_endpoint_test.go | 1 + nomad/structs/structs.go | 26 ++++++++ nomad/structs/structs_test.go | 66 +++++++++++++++++++ .../pages/docs/job-specification/service.mdx | 4 +- 7 files changed, 141 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92a6b6e3d3a..2d7d826002a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ IMPROVEMENTS: BUG FIXES: * core: Fixed a bug where blocking queries would not include the query's maximum wait time when calculating whether it was safe to retry. [[GH-8921](https://github.com/hashicorp/nomad/issues/8921)] + * consul: Fixed a bug to correctly validate task when using script-checks in group-level services [[GH-8952](https://github.com/hashicorp/nomad/issues/8952)] ## 0.12.5 (September 17, 2020) diff --git a/client/allocrunner/taskrunner/script_check_hook.go b/client/allocrunner/taskrunner/script_check_hook.go index f10486ac526..58d891089c9 100644 --- a/client/allocrunner/taskrunner/script_check_hook.go +++ b/client/allocrunner/taskrunner/script_check_hook.go @@ -204,6 +204,10 @@ func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck { // for them. The group-level service and any check restart behaviors it // needs are entirely encapsulated within the group service hook which // watches Consul for status changes. + // + // The script check is associated with a group task if the service.task or + // service.check.task matches the task name. The service.check.task takes + // precedence. tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services) for _, service := range interpolatedGroupServices { @@ -211,7 +215,7 @@ func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck { if check.Type != structs.ServiceCheckScript { continue } - if check.TaskName != h.task.Name { + if !h.associated(h.task.Name, service.TaskName, check.TaskName) { continue } groupTaskName := "group-" + tg.Name @@ -237,6 +241,20 @@ func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck { return scriptChecks } +// associated returns true if the script check is associated with the task. This +// would be the case if the check.task is the same as task, or if the service.task +// is the same as the task _and_ check.task is not configured (i.e. the check +// inherits the task of the service). +func (*scriptCheckHook) associated(task, serviceTask, checkTask string) bool { + if checkTask == task { + return true + } + if serviceTask == task && checkTask == "" { + return true + } + return false +} + // heartbeater is the subset of consul agent functionality needed by script // checks to heartbeat type heartbeater interface { @@ -371,7 +389,7 @@ const ( updateTTLBackoffLimit = 3 * time.Second ) -// updateTTL updates the state to Consul, performing an expontential backoff +// updateTTL updates the state to Consul, performing an exponential backoff // in the case where the check isn't registered in Consul to avoid a race between // service registration and the first check. func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error { diff --git a/client/allocrunner/taskrunner/script_check_hook_test.go b/client/allocrunner/taskrunner/script_check_hook_test.go index 9fba3a67f66..73ea2050e9b 100644 --- a/client/allocrunner/taskrunner/script_check_hook_test.go +++ b/client/allocrunner/taskrunner/script_check_hook_test.go @@ -286,3 +286,27 @@ func TestScript_TaskEnvInterpolation(t *testing.T) { require.True(t, ok) require.Equal(t, "my-job-backend-check", check.check.Name) } + +func TestScript_associated(t *testing.T) { + t.Run("neither set", func(t *testing.T) { + require.False(t, new(scriptCheckHook).associated("task1", "", "")) + }) + + t.Run("service set", func(t *testing.T) { + require.True(t, new(scriptCheckHook).associated("task1", "task1", "")) + require.False(t, new(scriptCheckHook).associated("task1", "task2", "")) + }) + + t.Run("check set", func(t *testing.T) { + require.True(t, new(scriptCheckHook).associated("task1", "", "task1")) + require.False(t, new(scriptCheckHook).associated("task1", "", "task2")) + }) + + t.Run("both set", func(t *testing.T) { + // ensure check.task takes precedence over service.task + require.True(t, new(scriptCheckHook).associated("task1", "task1", "task1")) + require.False(t, new(scriptCheckHook).associated("task1", "task1", "task2")) + require.True(t, new(scriptCheckHook).associated("task1", "task2", "task1")) + require.False(t, new(scriptCheckHook).associated("task1", "task2", "task2")) + }) +} diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 7ff57ccd06c..c80fa533f27 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -478,6 +478,7 @@ func TestJobEndpoint_Register_ConnectExposeCheck(t *testing.T) { Name: "check2", Type: "script", Command: "/bin/true", + TaskName: "web", Interval: 1 * time.Second, Timeout: 1 * time.Second, }, { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 340f2e36c37..1fcd57f05ce 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5801,6 +5801,12 @@ func (tg *TaskGroup) Validate(j *Job) error { mErr.Errors = append(mErr.Errors, outer) } + // Validate group service script-checks + if err := tg.validateScriptChecksInGroupServices(); err != nil { + outer := fmt.Errorf("Task group service check validation failed: %v", err) + mErr.Errors = append(mErr.Errors, outer) + } + // Validate the scaling policy if err := tg.validateScalingPolicy(j); err != nil { outer := fmt.Errorf("Task group scaling policy validation failed: %v", err) @@ -5950,6 +5956,26 @@ func (tg *TaskGroup) validateServices() error { return mErr.ErrorOrNil() } +// validateScriptChecksInGroupServices ensures group-level services with script +// checks know what task driver to use. Either the service.task or service.check.task +// parameter must be configured. +func (tg *TaskGroup) validateScriptChecksInGroupServices() error { + var mErr multierror.Error + for _, service := range tg.Services { + if service.TaskName == "" { + for _, check := range service.Checks { + if check.Type == "script" && check.TaskName == "" { + mErr.Errors = append(mErr.Errors, + fmt.Errorf("Service [%s]->%s or Check %s must specify task parameter", + tg.Name, service.Name, check.Name, + )) + } + } + } + } + return mErr.ErrorOrNil() +} + // validateScalingPolicy ensures that the scaling policy has consistent // min and max, not in conflict with the task group count func (tg *TaskGroup) validateScalingPolicy(j *Job) error { diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 9ea2c8636c0..2a0cc00c7e1 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -5584,3 +5584,69 @@ func TestAllocatedSharedResources_Canonicalize(t *testing.T) { }, }, a.Ports) } + +func TestTaskGroup_validateScriptChecksInGroupServices(t *testing.T) { + t.Run("service task not set", func(t *testing.T) { + tg := &TaskGroup{ + Name: "group1", + Services: []*Service{{ + Name: "service1", + TaskName: "", // unset + Checks: []*ServiceCheck{{ + Name: "check1", + Type: "script", + TaskName: "", // unset + }, { + Name: "check2", + Type: "ttl", // not script + }, { + Name: "check3", + Type: "script", + TaskName: "", // unset + }}, + }, { + Name: "service2", + Checks: []*ServiceCheck{{ + Type: "script", + TaskName: "task1", // set + }}, + }, { + Name: "service3", + TaskName: "", // unset + Checks: []*ServiceCheck{{ + Name: "check1", + Type: "script", + TaskName: "", // unset + }}, + }}, + } + + errStr := tg.validateScriptChecksInGroupServices().Error() + require.Contains(t, errStr, "Service [group1]->service1 or Check check1 must specify task parameter") + require.Contains(t, errStr, "Service [group1]->service1 or Check check3 must specify task parameter") + require.Contains(t, errStr, "Service [group1]->service3 or Check check1 must specify task parameter") + }) + + t.Run("service task set", func(t *testing.T) { + tgOK := &TaskGroup{ + Name: "group1", + Services: []*Service{{ + Name: "service1", + TaskName: "task1", + Checks: []*ServiceCheck{{ + Name: "check1", + Type: "script", + }, { + Name: "check2", + Type: "ttl", + }, { + Name: "check3", + Type: "script", + }}, + }}, + } + + mErrOK := tgOK.validateScriptChecksInGroupServices() + require.Nil(t, mErrOK) + }) +} diff --git a/website/pages/docs/job-specification/service.mdx b/website/pages/docs/job-specification/service.mdx index 634f3ef6c96..716e11215d8 100644 --- a/website/pages/docs/job-specification/service.mdx +++ b/website/pages/docs/job-specification/service.mdx @@ -246,7 +246,8 @@ scripts. - `task` `(string: )` - Specifies the task associated with this check. Scripts are executed within the task's environment, and `check_restart` stanzas will apply to the specified task. For `checks` on group - level `services` only. + level `services` only. Inherits the [`service.task`][service_task] value if not + set. - `timeout` `(string: )` - Specifies how long Consul will wait for a health check query to succeed. This is specified using a label suffix like @@ -714,3 +715,4 @@ advertise and check directly since Nomad isn't managing any port assignments. [shutdowndelay]: /docs/job-specification/task#shutdown_delay [killsignal]: /docs/job-specification/task#kill_signal [killtimeout]: /docs/job-specification/task#kill_timeout +[service_task]: /docs/job-specification/service#task-1