From 25569282b9b0ed6aeafdfeedc7469ad092db94f5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 5 Dec 2017 11:39:42 -0800 Subject: [PATCH 01/16] Allow custom ports for services and checks Fixes #3380 Adds address_mode to checks (but no auto) and allows services and checks to set literal port numbers when using address_mode=driver. This allows SDNs, overlays, etc to advertise internal and host addresses as well as do checks against either. --- api/tasks.go | 1 + command/agent/consul/client.go | 91 +++++++++--- command/agent/consul/unit_test.go | 139 ++++++++++++++++++ command/agent/job_endpoint.go | 1 + command/agent/job_endpoint_test.go | 2 + jobspec/parse.go | 1 + jobspec/parse_test.go | 48 ++++++ .../service-check-driver-address.hcl | 38 +++++ nomad/structs/structs.go | 27 +++- 9 files changed, 323 insertions(+), 25 deletions(-) create mode 100644 jobspec/test-fixtures/service-check-driver-address.hcl diff --git a/api/tasks.go b/api/tasks.go index 462d9f54931..7dc2950b187 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -154,6 +154,7 @@ type ServiceCheck struct { Path string Protocol string PortLabel string `mapstructure:"port"` + AddressMode string `mapstructure:"address_mode"` Interval time.Duration Timeout time.Duration InitialStatus string `mapstructure:"initial_status"` diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 57912b35bf9..574e89dd94f 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -569,23 +569,16 @@ func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *st checkIDs: make(map[string]struct{}, len(service.Checks)), } - // Determine the address to advertise + // Service address modes default to auto addrMode := service.AddressMode - if addrMode == structs.AddressModeAuto { - if net.Advertise() { - addrMode = structs.AddressModeDriver - } else { - // No driver network or shouldn't default to driver's network - addrMode = structs.AddressModeHost - } + if addrMode == "" { + addrMode = structs.AddressModeAuto } - ip, port := task.Resources.Networks.Port(service.PortLabel) - if addrMode == structs.AddressModeDriver { - if net == nil { - return nil, fmt.Errorf("service %s cannot use driver's IP because driver didn't set one", service.Name) - } - ip = net.IP - port = net.PortMap[service.PortLabel] + + // Determine the address to advertise based on the mode + ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net) + if err != nil { + return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) } // Build the Consul Service registration request @@ -641,13 +634,24 @@ func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, se } - // Checks should always use the host ip:port + // Default to the service's port but allow check to override portLabel := check.PortLabel if portLabel == "" { // Default to the service's port label portLabel = service.PortLabel } - ip, port := task.Resources.Networks.Port(portLabel) + + // Checks address mode defaults to host for pre-#3380 backward compat + addrMode := check.AddressMode + if addrMode == "" { + addrMode = structs.AddressModeHost + } + + ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net) + if err != nil { + return nil, fmt.Errorf("unable to get address for check %q: %v", check.Name, err) + } + checkReg, err := createCheckReg(serviceID, checkID, check, ip, port) if err != nil { return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) @@ -709,8 +713,8 @@ func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restart func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { ops := &operations{} - t := new(TaskRegistration) - t.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) + taskReg := new(TaskRegistration) + taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) existingIDs := make(map[string]*structs.Service, len(existing.Services)) for _, s := range existing.Services { @@ -745,7 +749,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta serviceID: existingID, checkIDs: make(map[string]struct{}, len(newSvc.Checks)), } - t.Services[existingID] = sreg + taskReg.Services[existingID] = sreg // PortLabel and AddressMode aren't included in the ID, so we // have to compare manually. @@ -755,7 +759,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta delete(newIDs, existingID) } - // Check to see what checks were updated + // See what checks were updated existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) for _, check := range existingSvc.Checks { existingChecks[makeCheckID(existingID, check)] = check @@ -806,11 +810,11 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta return err } - t.Services[sreg.serviceID] = sreg + taskReg.Services[sreg.serviceID] = sreg } // Add the task to the allocation's registration - c.addTaskRegistration(allocID, newTask.Name, t) + c.addTaskRegistration(allocID, newTask.Name, taskReg) c.commit(ops) @@ -1079,3 +1083,44 @@ func isNomadService(id string) bool { const prefix = nomadServicePrefix + "-executor" return strings.HasPrefix(id, prefix) } + +// getAddress returns the ip and port to use for a service or check. An error +// is returned if an ip and port cannot be determined. +func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) { + switch addrMode { + case structs.AddressModeAuto: + if driverNet.Advertise() { + addrMode = structs.AddressModeDriver + } else { + addrMode = structs.AddressModeHost + } + return getAddress(addrMode, portLabel, networks, driverNet) + case structs.AddressModeHost: + // Default path: use host ip:port + ip, port := networks.Port(portLabel) + return ip, port, nil + + case structs.AddressModeDriver: + // Require a driver network if driver address mode is used + if driverNet == nil { + return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) + } + + // If the port is a label, use the driver's port (not the host's) + if port, ok := driverNet.PortMap[portLabel]; ok { + return driverNet.IP, port, nil + } + + // If port isn't a label, try to parse it as a literal port number + port, err := strconv.Atoi(portLabel) + if err != nil { + return "", 0, fmt.Errorf("invalid port %q: %v", portLabel, err) + } + + return driverNet.IP, port, nil + + default: + // Shouldn't happen due to validation, but enforce invariants + return "", 0, fmt.Errorf("invalid address mode %q", addrMode) + } +} diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 8bd3e08a5d8..73d82bd1205 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -16,6 +16,7 @@ import ( cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/structs" "github.com/kr/pretty" + "github.com/stretchr/testify/assert" ) const ( @@ -1440,3 +1441,141 @@ func TestCreateCheckReg(t *testing.T) { t.Fatalf("diff:\n%s\n", strings.Join(diff, "\n")) } } + +// TestGetAddress asserts Nomad uses the correct ip and port for services and +// checks depending on port labels, driver networks, and address mode. +func TestGetAddress(t *testing.T) { + const HostIP = "127.0.0.1" + + cases := []struct { + Name string + + // Parameters + Mode string + PortLabel string + Host map[string]int // will be converted to structs.Networks + Driver *cstructs.DriverNetwork + + // Results + IP string + Port int + ErrContains string + }{ + { + Name: "ExampleService", + Mode: structs.AddressModeAuto, + PortLabel: "db", + Host: map[string]int{"db": 12435}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + }, + IP: HostIP, + Port: 12435, + }, + { + Name: "Host", + Mode: structs.AddressModeHost, + PortLabel: "db", + Host: map[string]int{"db": 12345}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + }, + IP: HostIP, + Port: 12345, + }, + { + Name: "Driver", + Mode: structs.AddressModeDriver, + PortLabel: "db", + Host: map[string]int{"db": 12345}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + }, + IP: "10.1.2.3", + Port: 6379, + }, + { + Name: "AutoDriver", + Mode: structs.AddressModeAuto, + PortLabel: "db", + Host: map[string]int{"db": 12345}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + AutoAdvertise: true, + }, + IP: "10.1.2.3", + Port: 6379, + }, + { + Name: "DriverCustomPort", + Mode: structs.AddressModeDriver, + PortLabel: "7890", + Host: map[string]int{"db": 12345}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + }, + IP: "10.1.2.3", + Port: 7890, + }, + { + Name: "DriverWithoutNetwork", + Mode: structs.AddressModeDriver, + PortLabel: "db", + Host: map[string]int{"db": 12345}, + Driver: nil, + ErrContains: "no driver network exists", + }, + { + Name: "DriverBadPort", + Mode: structs.AddressModeDriver, + PortLabel: "bad-port-label", + Host: map[string]int{"db": 12345}, + Driver: &cstructs.DriverNetwork{ + PortMap: map[string]int{"db": 6379}, + IP: "10.1.2.3", + }, + ErrContains: "invalid port", + }, + { + Name: "InvalidMode", + Mode: "invalid-mode", + ErrContains: "invalid address mode", + }, + } + + for _, tc := range cases { + t.Run(tc.Name, func(t *testing.T) { + // convert host port map into a structs.Networks + networks := []*structs.NetworkResource{ + { + IP: HostIP, + ReservedPorts: make([]structs.Port, len(tc.Host)), + }, + } + + i := 0 + for label, port := range tc.Host { + networks[0].ReservedPorts[i].Label = label + networks[0].ReservedPorts[i].Value = port + i++ + } + + // Run getAddress + ip, port, err := getAddress(tc.Mode, tc.PortLabel, networks, tc.Driver) + + // Assert the results + assert.Equal(t, tc.IP, ip, "IP mismatch") + assert.Equal(t, tc.Port, port, "Port mismatch") + if tc.ErrContains == "" { + assert.Nil(t, err) + } else { + assert.Contains(t, err.Error(), tc.ErrContains) + } + }) + } +} diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 9a154679e95..6b0e3a56548 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -710,6 +710,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { Path: check.Path, Protocol: check.Protocol, PortLabel: check.PortLabel, + AddressMode: check.AddressMode, Interval: check.Interval, Timeout: check.Timeout, InitialStatus: check.InitialStatus, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index 8eb74a9b92b..019a82ae059 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1222,6 +1222,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Path: "/check", Protocol: "http", PortLabel: "foo", + AddressMode: "driver", Interval: 4 * time.Second, Timeout: 2 * time.Second, InitialStatus: "ok", @@ -1418,6 +1419,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Path: "/check", Protocol: "http", PortLabel: "foo", + AddressMode: "driver", Interval: 4 * time.Second, Timeout: 2 * time.Second, InitialStatus: "ok", diff --git a/jobspec/parse.go b/jobspec/parse.go index f44cb75406f..d25f38bd0f8 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -981,6 +981,7 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { "header", "method", "check_restart", + "address_mode", } if err := helper.CheckHCLKeys(co.Val, valid); err != nil { return multierror.Prefix(err, "check ->") diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 5922edeff51..2dfc890d42f 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -583,6 +583,54 @@ func TestParse(t *testing.T) { }, false, }, + { + "service-check-driver-address.hcl", + &api.Job{ + ID: helper.StringToPtr("address_mode_driver"), + Name: helper.StringToPtr("address_mode_driver"), + Type: helper.StringToPtr("service"), + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("group"), + Tasks: []*api.Task{ + { + Name: "task", + Services: []*api.Service{ + { + Name: "http-service", + PortLabel: "http", + AddressMode: "auto", + Checks: []api.ServiceCheck{ + { + Name: "http-check", + Type: "http", + Path: "/", + PortLabel: "http", + AddressMode: "driver", + }, + }, + }, + { + Name: "random-service", + PortLabel: "9000", + AddressMode: "driver", + Checks: []api.ServiceCheck{ + { + Name: "random-check", + Type: "tcp", + PortLabel: "9001", + AddressMode: "driver", + }, + }, + }, + }, + }, + }, + }, + }, + }, + false, + }, } for _, tc := range cases { diff --git a/jobspec/test-fixtures/service-check-driver-address.hcl b/jobspec/test-fixtures/service-check-driver-address.hcl new file mode 100644 index 00000000000..a9fa88d422c --- /dev/null +++ b/jobspec/test-fixtures/service-check-driver-address.hcl @@ -0,0 +1,38 @@ +job "address_mode_driver" { + type = "service" + group "group" { + task "task" { + service { + name = "http-service" + port = "http" + + address_mode = "auto" + + check { + name = "http-check" + type = "http" + path = "/" + port = "http" + + address_mode = "driver" + } + } + + service { + name = "random-service" + port = "9000" + + address_mode = "driver" + + check { + name = "random-check" + type = "tcp" + port = "9001" + + address_mode = "driver" + } + } + } + } +} + diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1836aae828f..fa29071aa1c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2866,6 +2866,7 @@ type ServiceCheck struct { Path string // path of the health check url for http type check Protocol string // Protocol to use if check is http, defaults to http PortLabel string // The port to use for tcp/http checks + AddressMode string // 'host' to use host ip:port or 'driver' to use driver's Interval time.Duration // Interval of the check Timeout time.Duration // Timeout of the response from the check before consul fails the check InitialStatus string // Initial status of the check @@ -2911,6 +2912,7 @@ func (sc *ServiceCheck) Canonicalize(serviceName string) { // validate a Service's ServiceCheck func (sc *ServiceCheck) validate() error { + // Validate Type switch strings.ToLower(sc.Type) { case ServiceCheckTCP: case ServiceCheckHTTP: @@ -2926,6 +2928,7 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf(`invalid type (%+q), must be one of "http", "tcp", or "script" type`, sc.Type) } + // Validate interval and timeout if sc.Interval == 0 { return fmt.Errorf("missing required value interval. Interval cannot be less than %v", minCheckInterval) } else if sc.Interval < minCheckInterval { @@ -2938,9 +2941,9 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf("timeout (%v) is lower than required minimum timeout %v", sc.Timeout, minCheckInterval) } + // Validate InitialStatus switch sc.InitialStatus { case "": - // case api.HealthUnknown: TODO: Add when Consul releases 0.7.1 case api.HealthPassing: case api.HealthWarning: case api.HealthCritical: @@ -2949,6 +2952,16 @@ func (sc *ServiceCheck) validate() error { } + // Validate AddressMode + switch sc.AddressMode { + case "", AddressModeHost, AddressModeDriver: + // Ok + case AddressModeAuto: + return fmt.Errorf("invalid address_mode %q - %s only valid for services", sc.AddressMode, AddressModeAuto) + default: + return fmt.Errorf("invalid address_mode %q", sc.AddressMode) + } + return sc.CheckRestart.Validate() } @@ -3001,6 +3014,11 @@ func (sc *ServiceCheck) Hash(serviceID string) string { io.WriteString(h, strings.Join(headers, "")) } + // Only include AddressMode if set to maintain ID stability with Nomad <0.7.1 + if len(sc.AddressMode) > 0 { + io.WriteString(h, sc.AddressMode) + } + return fmt.Sprintf("%x", h.Sum(nil)) } @@ -3455,7 +3473,12 @@ func validateServices(t *Task) error { knownServices[service.Name+service.PortLabel] = struct{}{} if service.PortLabel != "" { - servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) + if _, err := strconv.Atoi(service.PortLabel); service.AddressMode == "driver" && err == nil { + // Numeric ports are valid when AddressMode=driver + } else { + + servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) + } } // Ensure that check names are unique. From 74e7d57b75587819f822a5d8f99b640f2020dc11 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 6 Dec 2017 15:40:04 -0800 Subject: [PATCH 02/16] Test Consul from TaskRunner thoroughly Rely less on the mockConsulServiceClient because the real consul.ServiceClient needs all the testing it can get! --- client/driver/mock_driver.go | 53 +++++- client/task_runner_test.go | 243 +++++++++++++++++++----- command/agent/consul/catalog_testing.go | 14 ++ 3 files changed, 255 insertions(+), 55 deletions(-) diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index 7db3072b5fc..49279485426 100644 --- a/client/driver/mock_driver.go +++ b/client/driver/mock_driver.go @@ -10,6 +10,7 @@ import ( "log" "os" "strconv" + "strings" "time" "github.com/mitchellh/mapstructure" @@ -61,6 +62,17 @@ type MockDriverConfig struct { // SignalErr is the error message that the task returns if signalled SignalErr string `mapstructure:"signal_error"` + + // DriverIP will be returned as the DriverNetwork.IP from Start() + DriverIP string `mapstructure:"driver_ip"` + + // DriverAdvertise will be returned as DriverNetwork.AutoAdvertise from + // Start(). + DriverAdvertise bool `mapstructure:"driver_advertise"` + + // DriverPortMap will parse a label:number pair and return it in + // DriverNetwork.PortMap from Start(). + DriverPortMap string `mapstructure:"driver_port_map"` } // MockDriver is a driver which is used for testing purposes @@ -114,6 +126,23 @@ func (m *MockDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse return nil, structs.NewRecoverableError(errors.New(driverConfig.StartErr), driverConfig.StartErrRecoverable) } + // Create the driver network + net := &cstructs.DriverNetwork{ + IP: driverConfig.DriverIP, + AutoAdvertise: driverConfig.DriverAdvertise, + } + if raw := driverConfig.DriverPortMap; len(raw) > 0 { + parts := strings.Split(raw, ":") + if len(parts) != 2 { + return nil, fmt.Errorf("malformed port map: %q", raw) + } + port, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, fmt.Errorf("malformed port map: %q -- error: %v", raw, err) + } + net.PortMap = map[string]int{parts[0]: port} + } + h := mockDriverHandle{ taskName: task.Name, runFor: driverConfig.RunFor, @@ -133,7 +162,8 @@ func (m *MockDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse } m.logger.Printf("[DEBUG] driver.mock: starting task %q", task.Name) go h.run() - return &StartResponse{Handle: &h}, nil + + return &StartResponse{Handle: &h, Network: net}, nil } // Cleanup deletes all keys except for Config.Options["cleanup_fail_on"] for @@ -265,10 +295,20 @@ func (h *mockDriverHandle) Kill() error { select { case <-h.doneCh: case <-time.After(h.killAfter): - close(h.doneCh) + select { + case <-h.doneCh: + // already closed + default: + close(h.doneCh) + } case <-time.After(h.killTimeout): h.logger.Printf("[DEBUG] driver.mock: terminating task %q", h.taskName) - close(h.doneCh) + select { + case <-h.doneCh: + // already closed + default: + close(h.doneCh) + } } return nil } @@ -286,7 +326,12 @@ func (h *mockDriverHandle) run() { for { select { case <-timer.C: - close(h.doneCh) + select { + case <-h.doneCh: + // already closed + default: + close(h.doneCh) + } case <-h.doneCh: h.logger.Printf("[DEBUG] driver.mock: finished running task %q", h.taskName) h.waitCh <- dstructs.NewWaitResult(h.exitCode, h.exitSignal, h.exitErr) diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 5b99a1ca07b..8221705071d 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -73,15 +73,17 @@ func (m *MockTaskStateUpdater) String() string { } type taskRunnerTestCtx struct { - upd *MockTaskStateUpdater - tr *TaskRunner - allocDir *allocdir.AllocDir - vault *vaultclient.MockVaultClient - consul *mockConsulServiceClient + upd *MockTaskStateUpdater + tr *TaskRunner + allocDir *allocdir.AllocDir + vault *vaultclient.MockVaultClient + consul *consul.MockAgent + consulClient *consul.ServiceClient } // Cleanup calls Destroy on the task runner and alloc dir func (ctx *taskRunnerTestCtx) Cleanup() { + ctx.consulClient.Shutdown() ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled)) ctx.allocDir.Destroy() } @@ -143,17 +145,20 @@ func testTaskRunnerFromAlloc(t *testing.T, restarts bool, alloc *structs.Allocat } vclient := vaultclient.NewMockVaultClient() - cclient := newMockConsulServiceClient() - tr := NewTaskRunner(logger, conf, db, upd.Update, taskDir, alloc, task, vclient, cclient) + cclient := consul.NewMockAgent() + serviceClient := consul.NewServiceClient(cclient, true, logger) + go serviceClient.Run() + tr := NewTaskRunner(logger, conf, db, upd.Update, taskDir, alloc, task, vclient, serviceClient) if !restarts { tr.restartTracker = noRestartsTracker() } return &taskRunnerTestCtx{ - upd: upd, - tr: tr, - allocDir: allocDir, - vault: vclient, - consul: cclient, + upd: upd, + tr: tr, + allocDir: allocDir, + vault: vclient, + consul: cclient, + consulClient: serviceClient, } } @@ -339,7 +344,12 @@ func TestTaskRunner_Update(t *testing.T) { t.Parallel() alloc := mock.Alloc() task := alloc.Job.TaskGroups[0].Tasks[0] - task.Services[0].Checks[0].Args[0] = "${NOMAD_META_foo}" + task.Services[0].Checks[0] = &structs.ServiceCheck{ + Name: "http-check", + Type: "http", + PortLabel: "web", + Path: "${NOMAD_META_foo}", + } task.Driver = "mock_driver" task.Config = map[string]interface{}{ "run_for": "100s", @@ -350,6 +360,8 @@ func TestTaskRunner_Update(t *testing.T) { go ctx.tr.Run() defer ctx.Cleanup() + testWaitForTaskToStart(t, ctx) + // Update the task definition updateAlloc := ctx.tr.alloc.Copy() @@ -363,10 +375,9 @@ func TestTaskRunner_Update(t *testing.T) { // Update meta to make sure service checks are interpolated correctly // #2180 - newTask.Meta["foo"] = "UPDATE" + newTask.Meta["foo"] = "/UPDATE" // Update the kill timeout - testWaitForTaskToStart(t, ctx) oldHandle := ctx.tr.handle.ID() newTask.KillTimeout = time.Hour ctx.tr.Update(updateAlloc) @@ -380,25 +391,22 @@ func TestTaskRunner_Update(t *testing.T) { return false, fmt.Errorf("Task not copied") } if ctx.tr.restartTracker.policy.Mode != newMode { - return false, fmt.Errorf("restart policy not ctx.updated") + return false, fmt.Errorf("expected restart policy %q but found %q", newMode, ctx.tr.restartTracker.policy.Mode) } if ctx.tr.handle.ID() == oldHandle { return false, fmt.Errorf("handle not ctx.updated") } + // Make sure Consul services were interpolated correctly during // the update #2180 - consul := ctx.tr.consul.(*mockConsulServiceClient) - consul.mu.Lock() - defer consul.mu.Unlock() - if len(consul.ops) < 2 { - return false, fmt.Errorf("expected at least 2 consul ops found: %d", len(consul.ops)) + checks := ctx.consul.CheckRegs() + if n := len(checks); n != 1 { + return false, fmt.Errorf("expected 1 check but found %d", n) } - lastOp := consul.ops[len(consul.ops)-1] - if lastOp.op != "update" { - return false, fmt.Errorf("expected last consul op to be update not %q", lastOp.op) - } - if found := lastOp.task.Services[0].Checks[0].Args[0]; found != "UPDATE" { - return false, fmt.Errorf("expected consul check to be UPDATE but found: %q", found) + for _, check := range checks { + if found := check.HTTP; !strings.HasSuffix(found, "/UPDATE") { + return false, fmt.Errorf("expected consul check path to end with /UPDATE but found: %q", found) + } } return true, nil }, func(err error) { @@ -635,12 +643,16 @@ func TestTaskRunner_UnregisterConsul_Retries(t *testing.T) { } ctx := testTaskRunnerFromAlloc(t, true, alloc) + + // Use mockConsulServiceClient + consul := newMockConsulServiceClient() + ctx.tr.consul = consul + ctx.tr.MarkReceived() ctx.tr.Run() defer ctx.Cleanup() // Assert it is properly registered and unregistered - consul := ctx.tr.consul.(*mockConsulServiceClient) if expected := 4; len(consul.ops) != expected { t.Errorf("expected %d consul ops but found: %d", expected, len(consul.ops)) } @@ -1742,6 +1754,8 @@ func TestTaskRunner_ShutdownDelay(t *testing.T) { alloc := mock.Alloc() task := alloc.Job.TaskGroups[0].Tasks[0] + task.Services[0].Tags = []string{"tag1"} + task.Services = task.Services[:1] // only need 1 for this test task.Driver = "mock_driver" task.Config = map[string]interface{}{ "run_for": "1000s", @@ -1758,34 +1772,39 @@ func TestTaskRunner_ShutdownDelay(t *testing.T) { // Wait for the task to start testWaitForTaskToStart(t, ctx) + testutil.WaitForResult(func() (bool, error) { + services, _ := ctx.consul.Services() + if n := len(services); n != 1 { + return false, fmt.Errorf("expected 1 service found %d", n) + } + for _, s := range services { + if !reflect.DeepEqual(s.Tags, task.Services[0].Tags) { + return false, fmt.Errorf("expected tags=%q but found %q", + strings.Join(task.Services[0].Tags, ","), strings.Join(s.Tags, ",")) + } + } + return true, nil + }, func(err error) { + services, _ := ctx.consul.Services() + for _, s := range services { + t.Logf("Service: %#v", s) + } + t.Fatalf("err: %v", err) + }) + // Begin the tear down ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled)) destroyed := time.Now() - // Service should get removed quickly; loop until RemoveTask is called - found := false - deadline := destroyed.Add(task.ShutdownDelay) - for time.Now().Before(deadline) { - time.Sleep(5 * time.Millisecond) - - ctx.consul.mu.Lock() - n := len(ctx.consul.ops) - if n < 2 { - ctx.consul.mu.Unlock() - continue - } - - lastOp := ctx.consul.ops[n-1].op - ctx.consul.mu.Unlock() - - if lastOp == "remove" { - found = true - break + testutil.WaitForResult(func() (bool, error) { + services, _ := ctx.consul.Services() + if n := len(services); n == 1 { + return false, fmt.Errorf("expected 0 services found %d", n) } - } - if !found { - t.Errorf("task was not removed from Consul first") - } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) // Wait for actual exit select { @@ -1893,3 +1912,125 @@ func TestTaskRunner_CheckWatcher_Restart(t *testing.T) { } } } + +// TestTaskRunner_DriverNetwork asserts that a driver's network is properly +// used in services and checks. +func TestTaskRunner_DriverNetwork(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "exit_code": 0, + "run_for": "100s", + "driver_ip": "10.1.2.3", + "driver_port_map": "http:80", + } + + // Create services and checks with custom address modes to exercise + // address detection logic + task.Services = []*structs.Service{ + { + Name: "host-service", + PortLabel: "http", + AddressMode: "host", + Checks: []*structs.ServiceCheck{ + { + Name: "driver-check", + Type: "tcp", + PortLabel: "1234", + AddressMode: "driver", + }, + }, + }, + { + Name: "driver-service", + PortLabel: "5678", + AddressMode: "driver", + Checks: []*structs.ServiceCheck{ + { + Name: "host-check", + Type: "tcp", + PortLabel: "http", + }, + { + Name: "driver-label-check", + Type: "tcp", + PortLabel: "http", + AddressMode: "driver", + }, + }, + }, + } + + ctx := testTaskRunnerFromAlloc(t, false, alloc) + ctx.tr.MarkReceived() + go ctx.tr.Run() + defer ctx.Cleanup() + + // Wait for the task to start + testWaitForTaskToStart(t, ctx) + + testutil.WaitForResult(func() (bool, error) { + services, _ := ctx.consul.Services() + if n := len(services); n != 2 { + return false, fmt.Errorf("expected 2 services, but found %d", n) + } + for _, s := range services { + switch s.Service { + case "host-service": + if expected := "192.168.0.100"; s.Address != expected { + return false, fmt.Errorf("expected host-service to have IP=%s but found %s", + expected, s.Address) + } + case "driver-service": + if expected := "10.1.2.3"; s.Address != expected { + return false, fmt.Errorf("expected driver-service to have IP=%s but found %s", + expected, s.Address) + } + if expected := 5678; s.Port != expected { + return false, fmt.Errorf("expected driver-service to have port=%d but found %d", + expected, s.Port) + } + default: + return false, fmt.Errorf("unexpected service: %q", s.Service) + } + + } + + checks := ctx.consul.CheckRegs() + if n := len(checks); n != 3 { + return false, fmt.Errorf("expected 3 checks, but found %d", n) + } + for _, check := range checks { + switch check.Name { + case "driver-check": + if expected := "10.1.2.3:1234"; check.TCP != expected { + return false, fmt.Errorf("expected driver-check to have address %q but found %q", expected, check.TCP) + } + case "driver-label-check": + if expected := "10.1.2.3:80"; check.TCP != expected { + return false, fmt.Errorf("expected driver-label-check to have address %q but found %q", expected, check.TCP) + } + case "host-check": + if expected := "192.168.0.100:"; !strings.HasPrefix(check.TCP, expected) { + return false, fmt.Errorf("expected host-check to have address start with %q but found %q", expected, check.TCP) + } + default: + return false, fmt.Errorf("unexpected check: %q", check.Name) + } + } + + return true, nil + }, func(err error) { + services, _ := ctx.consul.Services() + for _, s := range services { + t.Logf(pretty.Sprint("Serivce: ", s)) + } + for _, c := range ctx.consul.CheckRegs() { + t.Logf(pretty.Sprint("Check: ", c)) + } + t.Fatalf("error: %v", err) + }) +} diff --git a/command/agent/consul/catalog_testing.go b/command/agent/consul/catalog_testing.go index 6b28940f114..e0a31f77049 100644 --- a/command/agent/consul/catalog_testing.go +++ b/command/agent/consul/catalog_testing.go @@ -80,6 +80,7 @@ func (c *MockAgent) Services() (map[string]*api.AgentService, error) { return r, nil } +// Checks implements the Agent API Checks method. func (c *MockAgent) Checks() (map[string]*api.AgentCheck, error) { c.mu.Lock() defer c.mu.Unlock() @@ -98,6 +99,19 @@ func (c *MockAgent) Checks() (map[string]*api.AgentCheck, error) { return r, nil } +// CheckRegs returns the raw AgentCheckRegistrations registered with this mock +// agent. +func (c *MockAgent) CheckRegs() []*api.AgentCheckRegistration { + c.mu.Lock() + defer c.mu.Unlock() + + regs := make([]*api.AgentCheckRegistration, 0, len(c.checks)) + for _, check := range c.checks { + regs = append(regs, check) + } + return regs +} + func (c *MockAgent) CheckRegister(check *api.AgentCheckRegistration) error { c.mu.Lock() defer c.mu.Unlock() From 29e6f7cbfd9048768fe26045454fb37fdc281b83 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 6 Dec 2017 15:40:56 -0800 Subject: [PATCH 03/16] Fix interpolation bug with service/check updates Previously if only an interpolated variable used in a service or check was changed we interpolated the old and new services and checks with the new variable, so nothing appeared to have changed. --- client/task_runner.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index a6d230a62b8..e01cc34cdd6 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -1631,7 +1631,12 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { // Merge in the task resources updatedTask.Resources = update.TaskResources[updatedTask.Name] - // Update the task's environment for interpolating in services/checks + // Interpolate the old task with the old env before updating the env as + // updating services in Consul need both the old and new interpolations + // to find differences. + oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) + + // Now it's safe to update the environment r.envBuilder.UpdateTask(update, updatedTask) var mErr multierror.Error @@ -1650,7 +1655,8 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { } // Update services in Consul - if err := r.updateServices(drv, r.handle, r.task, updatedTask); err != nil { + newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask) + if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err)) } } @@ -1667,19 +1673,17 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { return mErr.ErrorOrNil() } -// updateServices and checks with Consul. -func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, old, new *structs.Task) error { +// updateServices and checks with Consul. Tasks must be interpolated! +func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error { var exec driver.ScriptExecutor if d.Abilities().Exec { // Allow set the script executor if the driver supports it exec = h } - newInterpolatedTask := interpolateServices(r.envBuilder.Build(), new) - oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), old) r.driverNetLock.Lock() net := r.driverNet.Copy() r.driverNetLock.Unlock() - return r.consul.UpdateTask(r.alloc.ID, oldInterpolatedTask, newInterpolatedTask, r, exec, net) + return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net) } // handleDestroy kills the task handle. In the case that killing fails, From f2d47156ec3740227cc4aa02609dad8b2119479c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 6 Dec 2017 16:02:24 -0800 Subject: [PATCH 04/16] Document service/check address_mode/port changes --- CHANGELOG.md | 5 ++++ .../docs/job-specification/service.html.md | 23 +++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68c17be6bc7..f7d631e19f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,10 +22,13 @@ IMPROVEMENTS: * api: Environment variables are ignored during service name validation [GH-3532] * cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago` [GH-3449] + * client: Support `address_mode` on checks [GH-3619] * client: Sticky volume migrations are now atomic. [GH-3563] * client: Added metrics to track state transitions of allocations [GH-3061] * client: When `network_interface` is unspecified use interface attached to default route [GH-3546] + * client: Support numeric ports on services and checks when + `address_mode="driver"` [GH-3619] * driver/docker: Detect OOM kill event [GH-3459] * driver/docker: Adds support for adding host device to container via `--device` [GH-2938] @@ -57,6 +60,8 @@ BUG FIXES: * client: Fix a panic when restoring an allocation with a dead leader task [GH-3502] * client: Fix crash when following logs from a Windows node [GH-3608] + * client: Fix service/check updating when just interpolated variables change + [GH-3619] * client: Fix allocation accounting in GC and trigger GCs on allocation updates [GH-3445] * driver/rkt: Remove pods on shutdown [GH-3562] diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index 9f96cac4fb8..4322c539bef 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -99,7 +99,8 @@ does not automatically enable service discovery. - `port` `(string: )` - Specifies the label of the port on which this service is running. Note this is the _label_ of the port and not the port number. The port label must match one defined in the [`network`][network] - stanza. + stanza unless you're also using `address_mode="driver"`. Numeric ports may be + used when in driver addressing mode. - `tags` `(array: [])` - Specifies the list of tags to associate with this service. If this is not supplied, no tags will be assigned to the service @@ -107,11 +108,12 @@ does not automatically enable service discovery. - `address_mode` `(string: "auto")` - Specifies what address (host or driver-specific) this service should advertise. `host` indicates the host IP - and port. `driver` advertises the IP used in the driver (e.g. Docker's internal - IP) and uses the ports specified in the port map. The default is `auto` which - behaves the same as `host` unless the driver determines its IP should be used. - This setting supported Docker since Nomad 0.6 and rkt since Nomad 0.7. It - will advertise the container IP if a network plugin is used (e.g. weave). + and port. `driver` advertises the IP used in the driver (e.g. Docker's + internal IP) and uses the container's port specified in the port map. The + default is `auto` which behaves the same as `host` unless the driver + determines its IP should be used. This setting supported Docker since Nomad + 0.6 and rkt since Nomad 0.7. It will advertise the container IP if a network + plugin is used (e.g. weave). ### `check` Parameters @@ -120,6 +122,11 @@ the script will run inside the Docker container. If your task is running in a chroot, it will run in the chroot. Please keep this in mind when authoring check scripts. +- `address_mode` `(string: "host")` - Same as `address_mode` on `service`. + Unlike services, checks do not have an `auto` address mode as there's no way + for Nomad to know which is the best address to use for checks. Consul needs + access to the address for any HTTP or TCP checks. Added in Nomad 0.7.1. + - `args` `(array: [])` - Specifies additional arguments to the `command`. This only applies to script-based health checks. @@ -161,7 +168,9 @@ scripts. stanza. If a port value was declared on the `service`, this will inherit from that value if not supplied. If supplied, this value takes precedence over the `service.port` value. This is useful for services which operate on multiple - ports. Checks will *always use the host IP and ports*. + ports. Checks will use the host IP and ports by default. In Nomad 0.7.1 or + later numeric ports may be used if `address_mode="driver"` is set on the + check. - `protocol` `(string: "http")` - Specifies the protocol for the http-based health checks. Valid options are `http` and `https`. From f11360acf8a108939c6308a5de3e4e2ddf37e759 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 7 Dec 2017 21:32:52 -0800 Subject: [PATCH 05/16] Add `Using Driver Address Mode` docs --- .../docs/job-specification/service.html.md | 127 +++++++++++++++++- 1 file changed, 123 insertions(+), 4 deletions(-) diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index 4322c539bef..eb462c1b01b 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -111,9 +111,10 @@ does not automatically enable service discovery. and port. `driver` advertises the IP used in the driver (e.g. Docker's internal IP) and uses the container's port specified in the port map. The default is `auto` which behaves the same as `host` unless the driver - determines its IP should be used. This setting supported Docker since Nomad - 0.6 and rkt since Nomad 0.7. It will advertise the container IP if a network - plugin is used (e.g. weave). + determines its IP should be used. This setting is supported in Docker since + Nomad 0.6 and rkt since Nomad 0.7. Nomad will advertise the container IP if a + network plugin is used (e.g. weave). See [below for + details.](#using-driver-address-mode) ### `check` Parameters @@ -125,7 +126,8 @@ scripts. - `address_mode` `(string: "host")` - Same as `address_mode` on `service`. Unlike services, checks do not have an `auto` address mode as there's no way for Nomad to know which is the best address to use for checks. Consul needs - access to the address for any HTTP or TCP checks. Added in Nomad 0.7.1. + access to the address for any HTTP or TCP checks. Added in Nomad 0.7.1. See + [below for details.](#using-driver-address-mode) - `args` `(array: [])` - Specifies additional arguments to the `command`. This only applies to script-based health checks. @@ -333,6 +335,123 @@ service { } ``` +### Using Driver Address Mode + +The [Docker](/docs/drivers/docker.html#network_mode) and +[rkt](/docs/drivers/rkt.html#net) drivers support the `driver` setting for the +`address_mode` parameter in both `service` and `check` stanzas. The driver +address mode allows advertising and health checking the IP and port assigned to +a task by the driver. This way if you're using a network plugin like Weave with +Docker, you can advertise the Weave address in Consul instead of the host's +address. + +For example if you were running the example Redis job in an environment with +Weave but Consul was running on the host you could use the following +configuration: + +```hcl +job "example" { + datacenters = ["dc1"] + group "cache" { + + task "redis" { + driver = "docker" + + config { + image = "redis:3.2" + network_mode = "weave" + port_map { + db = 6379 + } + } + + resources { + cpu = 500 # 500 MHz + memory = 256 # 256MB + network { + mbits = 10 + port "db" {} + } + } + + service { + name = "weave-redis" + port = "db" + check { + name = "host-redis-check" + type = "tcp" + interval = "10s" + timeout = "2s" + } + } + } + } +} +``` + +No explicit `address_mode` required! + +Services default to the `auto` address mode. When a Docker network mode other +than "host" or "bridge" is used, services will automatically advertise the +driver's address (in this case Weave's). The service will advertise the +container's port: 6379. + +However since Consul is often run on the host without access to the Weave +network, `check` stanzas default to `host` address mode. The TCP check will run +against the host's IP and the dynamic host port assigned by Nomad. + +Note that the `check` still inherits the `service` stanza's `db` port label, +but each will resolve the port label according to their address mode. + +If Consul has access to the Weave network the job could be configured like +this: + +```hcl +job "example" { + datacenters = ["dc1"] + group "cache" { + + task "redis" { + driver = "docker" + + config { + image = "redis:3.2" + network_mode = "weave" + # No port map required! + } + + resources { + cpu = 500 # 500 MHz + memory = 256 # 256MB + network { + mbits = 10 + } + } + + service { + name = "weave-redis" + port = 6379 + address_mode = "driver" + check { + name = "host-redis-check" + type = "tcp" + interval = "10s" + timeout = "2s" + port = 6379 + + address_mode = "driver" + } + } + } + } +} +``` + +In this case Nomad doesn't need to assign Redis any host ports. The `service` +and `check` stanzas can both specify the port number to advertise and check +directly since Nomad isn't managing any port assignments. + + - - - 1 Script checks are not supported for the From b99935889268be2da4e1485d52d7fb2b171f666d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 7 Dec 2017 21:58:15 -0800 Subject: [PATCH 06/16] Validate port label for host address mode Also skip getting an address for script checks which don't use them. Fixed a weird invalid reserved port in a TaskRunner test helper as well as a problem with our mock Alloc/Job. Hopefully the latter doesn't cause other tests to fail, but we were referencing an invalid PortLabel and just not catching it before. --- client/task_runner_test.go | 5 +---- command/agent/consul/client.go | 10 ++++++++++ command/agent/consul/unit_test.go | 12 +++++++++++- nomad/mock/mock.go | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 8221705071d..d91f5a96bbc 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -119,9 +119,6 @@ func testTaskRunnerFromAlloc(t *testing.T, restarts bool, alloc *structs.Allocat upd := &MockTaskStateUpdater{} task := alloc.Job.TaskGroups[0].Tasks[0] - // Initialize the port listing. This should be done by the offer process but - // we have a mock so that doesn't happen. - task.Resources.Networks[0].ReservedPorts = []structs.Port{{Label: "", Value: 80}} allocDir := allocdir.NewAllocDir(testLogger(), filepath.Join(conf.AllocDir, alloc.ID)) if err := allocDir.Build(); err != nil { @@ -347,7 +344,7 @@ func TestTaskRunner_Update(t *testing.T) { task.Services[0].Checks[0] = &structs.ServiceCheck{ Name: "http-check", Type: "http", - PortLabel: "web", + PortLabel: "http", Path: "${NOMAD_META_foo}", } task.Driver = "mock_driver" diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 574e89dd94f..78ea6429c74 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -632,6 +632,13 @@ func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, se ops.scripts = append(ops.scripts, newScriptCheck( allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh)) + // Skip getAddress for script checks + checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) + if err != nil { + return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) + } + ops.regChecks = append(ops.regChecks, checkReg) + continue } // Default to the service's port but allow check to override @@ -1098,6 +1105,9 @@ func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet case structs.AddressModeHost: // Default path: use host ip:port ip, port := networks.Port(portLabel) + if ip == "" && port == 0 { + return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) + } return ip, port, nil case structs.AddressModeDriver: diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 73d82bd1205..f9cec004fc7 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -1541,6 +1541,12 @@ func TestGetAddress(t *testing.T) { }, ErrContains: "invalid port", }, + { + Name: "HostBadPort", + Mode: structs.AddressModeHost, + PortLabel: "bad-port-label", + ErrContains: "invalid port", + }, { Name: "InvalidMode", Mode: "invalid-mode", @@ -1574,7 +1580,11 @@ func TestGetAddress(t *testing.T) { if tc.ErrContains == "" { assert.Nil(t, err) } else { - assert.Contains(t, err.Error(), tc.ErrContains) + if err == nil { + t.Fatalf("expected error containing %q but err=nil", tc.ErrContains) + } else { + assert.Contains(t, err.Error(), tc.ErrContains) + } } }) } diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 556dd0b8d05..41813f790f3 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -119,7 +119,7 @@ func Job() *structs.Job { }, { Name: "${TASK}-admin", - PortLabel: "admin", + PortLabel: "main", }, }, LogConfig: structs.DefaultLogConfig(), From fa5faa5a097fdd599e352da80d34596d6671d562 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 7 Dec 2017 22:04:22 -0800 Subject: [PATCH 07/16] Prevent using port 0 with address_mode=driver --- command/agent/consul/client.go | 3 +++ command/agent/consul/unit_test.go | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 78ea6429c74..732b3aed01d 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -1126,6 +1126,9 @@ func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet if err != nil { return "", 0, fmt.Errorf("invalid port %q: %v", portLabel, err) } + if port == 0 { + return "", 0, fmt.Errorf("invalid port: %q: port 0 is invalid", portLabel) + } return driverNet.IP, port, nil diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index f9cec004fc7..672ef376ed3 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -1541,6 +1541,15 @@ func TestGetAddress(t *testing.T) { }, ErrContains: "invalid port", }, + { + Name: "DriverZeroPort", + Mode: structs.AddressModeDriver, + PortLabel: "0", + Driver: &cstructs.DriverNetwork{ + IP: "10.1.2.3", + }, + ErrContains: "invalid port", + }, { Name: "HostBadPort", Mode: structs.AddressModeHost, From 7bf5cec82edf6a694e2f245f0116c9236f87fefe Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 7 Dec 2017 22:09:37 -0800 Subject: [PATCH 08/16] Improve port label validation and diff testing --- nomad/structs/diff_test.go | 6 ++++++ nomad/structs/structs.go | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 379f1a23995..4574bcb64c3 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -3507,6 +3507,12 @@ func TestTaskDiff(t *testing.T) { Type: DiffTypeEdited, Name: "Check", Fields: []*FieldDiff{ + { + Type: DiffTypeNone, + Name: "AddressMode", + Old: "", + New: "", + }, { Type: DiffTypeNone, Name: "Command", diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index fa29071aa1c..7fdc06fdbbe 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3473,10 +3473,14 @@ func validateServices(t *Task) error { knownServices[service.Name+service.PortLabel] = struct{}{} if service.PortLabel != "" { - if _, err := strconv.Atoi(service.PortLabel); service.AddressMode == "driver" && err == nil { - // Numeric ports are valid when AddressMode=driver + if service.AddressMode == "driver" { + // Numeric port labels are valid for address_mode=driver + _, err := strconv.Atoi(service.PortLabel) + if err != nil { + // Not a numeric port label, add it to list to check + servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) + } } else { - servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) } } From 65bfbe54c8a46d0a2a39815a2ffa9534e27532f8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 7 Dec 2017 17:08:25 -0800 Subject: [PATCH 09/16] Hash fields used in task service IDs Fixes #3620 Previously we concatenated tags into task service IDs. This could break deregistration of tag names that contained double //s like some Fabio tags. This change breaks service ID backward compatibility so on upgrade all users services and checks will be removed and re-added with new IDs. This change has the side effect of including all service fields in the ID's hash, so we no longer have to track PortLabel and AddressMode changes independently. --- CHANGELOG.md | 5 + command/agent/consul/client.go | 160 +++++++++++++++++------------- command/agent/consul/int_test.go | 7 +- command/agent/consul/unit_test.go | 23 +++-- nomad/structs/structs.go | 11 -- 5 files changed, 116 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7d631e19f0..2cd9eac035d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ ## 0.7.1 (Unreleased) __BACKWARDS INCOMPATIBILITIES:__ + * client: The format of service IDs in Consul has changed. If you rely upon + Nomad's service IDs (*not* service names; those are stable), you will need + to update your code. [GH-3632] * config: Nomad no longer parses Atlas configuration stanzas. Atlas has been deprecated since earlier this year. If you have an Atlas stanza in your config file it will have to be removed. @@ -57,6 +60,8 @@ BUG FIXES: explicitly [GH-3520] * cli: Fix passing Consul address via flags [GH-3504] * cli: Fix panic when running `keyring` commands [GH-3509] + * client: Fix advertising services with tags that require URL escaping + [GH-3632] * client: Fix a panic when restoring an allocation with a dead leader task [GH-3502] * client: Fix crash when following logs from a Windows node [GH-3608] diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 732b3aed01d..636ecb7b19d 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -2,7 +2,10 @@ package consul import ( "context" + "crypto/sha1" + "encoding/base32" "fmt" + "io" "log" "net" "net/url" @@ -21,10 +24,14 @@ import ( ) const ( - // nomadServicePrefix is the first prefix that scopes all Nomad registered - // services + // nomadServicePrefix is the prefix that scopes all Nomad registered + // services (both agent and task entries). nomadServicePrefix = "_nomad" + // nomadTaskPrefix is the prefix that scopes Nomad registered services + // for tasks. + nomadTaskPrefix = nomadServicePrefix + "-task-" + // defaultRetryInterval is how quickly to retry syncing services and // checks to Consul when an error occurs. Will backoff up to a max. defaultRetryInterval = time.Second @@ -288,8 +295,13 @@ func (c *ServiceClient) Run() { if err := c.sync(); err != nil { if failures == 0 { + // Log on the first failure c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) + } else if failures%10 == 0 { + // Log every 10th consecutive failure + c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) } + failures++ if !retryTimer.Stop() { // Timer already expired, since the timer may @@ -389,8 +401,14 @@ func (c *ServiceClient) sync() error { // Not managed by Nomad, skip continue } + // Unknown Nomad managed service; kill if err := c.client.ServiceDeregister(id); err != nil { + if isOldNomadService(id) { + // Don't hard-fail on old entries. See #3620 + continue + } + metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) return err } @@ -398,29 +416,16 @@ func (c *ServiceClient) sync() error { metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) } - // Track services whose ports have changed as their checks may also - // need updating - portsChanged := make(map[string]struct{}, len(c.services)) - // Add Nomad services missing from Consul for id, locals := range c.services { - if remotes, ok := consulServices[id]; ok { - // Make sure Port and Address are stable since - // PortLabel and AddressMode aren't included in the - // service ID. - if locals.Port == remotes.Port && locals.Address == remotes.Address { - // Already exists in Consul; skip - continue + if _, ok := consulServices[id]; !ok { + if err = c.client.ServiceRegister(locals); err != nil { + metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) + return err } - // Port changed, reregister it and its checks - portsChanged[id] = struct{}{} - } - if err = c.client.ServiceRegister(locals); err != nil { - metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + sreg++ + metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) } - sreg++ - metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) } // Remove Nomad checks in Consul but unknown locally @@ -433,8 +438,14 @@ func (c *ServiceClient) sync() error { // Service not managed by Nomad, skip continue } - // Unknown Nomad managed check; kill + + // Unknown Nomad managed check; remove if err := c.client.CheckDeregister(id); err != nil { + if isOldNomadService(check.ServiceID) { + // Don't hard-fail on old entries. + continue + } + metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) return err } @@ -444,12 +455,11 @@ func (c *ServiceClient) sync() error { // Add Nomad checks missing from Consul for id, check := range c.checks { - if check, ok := consulChecks[id]; ok { - if _, changed := portsChanged[check.ServiceID]; !changed { - // Already in Consul and ports didn't change; skipping - continue - } + if _, ok := consulChecks[id]; ok { + // Already in Consul; skipping + continue } + if err := c.client.CheckRegister(check); err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) return err @@ -751,6 +761,9 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta continue } + // Service exists and hasn't changed, don't re-add it later + delete(newIDs, existingID) + // Service still exists so add it to the task's registration sreg := &ServiceRegistration{ serviceID: existingID, @@ -758,15 +771,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta } taskReg.Services[existingID] = sreg - // PortLabel and AddressMode aren't included in the ID, so we - // have to compare manually. - serviceUnchanged := newSvc.PortLabel == existingSvc.PortLabel && newSvc.AddressMode == existingSvc.AddressMode - if serviceUnchanged { - // Service exists and hasn't changed, don't add it later - delete(newIDs, existingID) - } - - // See what checks were updated + // See if any checks were updated existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) for _, check := range existingSvc.Checks { existingChecks[makeCheckID(existingID, check)] = check @@ -779,17 +784,16 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta // Check exists, so don't remove it delete(existingChecks, checkID) sreg.checkIDs[checkID] = struct{}{} - } else if serviceUnchanged { - // New check on an unchanged service; add them now - newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net) - if err != nil { - return err - } + } - for _, checkID := range newCheckIDs { - sreg.checkIDs[checkID] = struct{}{} + // New check on an unchanged service; add them now + newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net) + if err != nil { + return err + } - } + for _, checkID := range newCheckIDs { + sreg.checkIDs[checkID] = struct{}{} } @@ -999,36 +1003,40 @@ func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { // // Agent service IDs are of the form: // -// {nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...} -// Example Server ID: _nomad-server-nomad-serf -// Example Client ID: _nomad-client-nomad-client-http +// {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) +// Example Server ID: _nomad-server-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4 +// Example Client ID: _nomad-client-GGNJPGL7YN7RGMVXZILMPVRZZVRSZC7L // func makeAgentServiceID(role string, service *structs.Service) string { - parts := make([]string, len(service.Tags)+3) - parts[0] = nomadServicePrefix - parts[1] = role - parts[2] = service.Name - copy(parts[3:], service.Tags) - return strings.Join(parts, "-") + h := sha1.New() + io.WriteString(h, service.Name) + for _, tag := range service.Tags { + io.WriteString(h, tag) + } + b32 := base32.StdEncoding.EncodeToString(h.Sum(nil)) + return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, b32) } // makeTaskServiceID creates a unique ID for identifying a task service in -// Consul. -// -// Task service IDs are of the form: -// -// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} -// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 +// Consul. All structs.Service fields are included in the ID's hash except +// Checks. This allows updates to merely compare IDs. // +// Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { - parts := make([]string, len(service.Tags)+5) - parts[0] = nomadServicePrefix - parts[1] = "executor" - parts[2] = allocID - parts[3] = taskName - parts[4] = service.Name - copy(parts[5:], service.Tags) - return strings.Join(parts, "-") + h := sha1.New() + io.WriteString(h, allocID) + io.WriteString(h, taskName) + io.WriteString(h, service.Name) + io.WriteString(h, service.PortLabel) + io.WriteString(h, service.AddressMode) + for _, tag := range service.Tags { + io.WriteString(h, tag) + } + + // Base32 is used for encoding the hash as sha1 hashes can always be + // encoded without padding, only 4 bytes larger than base64, and saves + // 8 bytes vs hex. + return nomadTaskPrefix + base32.StdEncoding.EncodeToString(h.Sum(nil)) } // makeCheckID creates a unique ID for a check. @@ -1084,9 +1092,21 @@ func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host } // isNomadService returns true if the ID matches the pattern of a Nomad managed -// service. Agent services return false as independent client and server agents -// may be running on the same machine. #2827 +// service (new or old formats). Agent services return false as independent +// client and server agents may be running on the same machine. #2827 func isNomadService(id string) bool { + return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) +} + +// isOldNomadService returns true if the ID matches an old pattern managed by +// Nomad. +// +// Pre-0.7.1 task service IDs are of the form: +// +// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} +// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 +// +func isOldNomadService(id string) bool { const prefix = nomadServicePrefix + "-executor" return strings.HasPrefix(id, prefix) } diff --git a/command/agent/consul/int_test.go b/command/agent/consul/int_test.go index 9295a696c9a..ba9975532a0 100644 --- a/command/agent/consul/int_test.go +++ b/command/agent/consul/int_test.go @@ -116,7 +116,12 @@ func TestConsul_Integration(t *testing.T) { { Name: "httpd2", PortLabel: "http", - Tags: []string{"test", "http2"}, + Tags: []string{ + "test", + // Use URL-unfriendly tags to test #3620 + "public-test.ettaviation.com:80/ redirect=302,https://test.ettaviation.com", + "public-test.ettaviation.com:443/", + }, }, } diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 672ef376ed3..88acc4ee152 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -220,8 +220,8 @@ func TestConsul_ChangeTags(t *testing.T) { } // TestConsul_ChangePorts asserts that changing the ports on a service updates -// it in Consul. Since ports are not part of the service ID this is a slightly -// different code path than changing tags. +// it in Consul. Pre-0.7.1 ports were not part of the service ID and this was a +// slightly different code path than changing tags. func TestConsul_ChangePorts(t *testing.T) { ctx := setupFake() ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ @@ -349,8 +349,8 @@ func TestConsul_ChangePorts(t *testing.T) { } for k, v := range ctx.FakeConsul.services { - if k != origServiceKey { - t.Errorf("unexpected key change; was: %q -- but found %q", origServiceKey, k) + if k == origServiceKey { + t.Errorf("expected key change; still: %q", k) } if v.Name != ctx.Task.Services[0].Name { t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name) @@ -370,15 +370,15 @@ func TestConsul_ChangePorts(t *testing.T) { for k, v := range ctx.FakeConsul.checks { switch v.Name { case "c1": - if k != origTCPKey { - t.Errorf("unexpected key change for %s from %q to %q", v.Name, origTCPKey, k) + if k == origTCPKey { + t.Errorf("expected key change for %s from %q", v.Name, origTCPKey) } if expected := fmt.Sprintf(":%d", xPort); v.TCP != expected { t.Errorf("expected Port x=%v but found: %v", expected, v.TCP) } case "c2": - if k != origScriptKey { - t.Errorf("unexpected key change for %s from %q to %q", v.Name, origScriptKey, k) + if k == origScriptKey { + t.Errorf("expected key change for %s from %q", v.Name, origScriptKey) } select { case <-ctx.execs: @@ -1383,9 +1383,16 @@ func TestIsNomadService(t *testing.T) { }{ {"_nomad-client-nomad-client-http", false}, {"_nomad-server-nomad-serf", false}, + + // Pre-0.7.1 style IDs still match {"_nomad-executor-abc", true}, {"_nomad-executor", true}, + + // Post-0.7.1 style IDs match + {"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true}, + {"not-nomad", false}, + {"_nomad", false}, } for _, test := range tests { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 7fdc06fdbbe..7884bbed067 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3140,17 +3140,6 @@ func (s *Service) ValidateName(name string) error { return nil } -// Hash calculates the hash of the check based on it's content and the service -// which owns it -func (s *Service) Hash() string { - h := sha1.New() - io.WriteString(h, s.Name) - io.WriteString(h, strings.Join(s.Tags, "")) - io.WriteString(h, s.PortLabel) - io.WriteString(h, s.AddressMode) - return fmt.Sprintf("%x", h.Sum(nil)) -} - const ( // DefaultKillTimeout is the default timeout between signaling a task it // will be killed and killing it. From afd5bca5663df384fa7948e2db49585caf210eff Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 11:50:14 -0800 Subject: [PATCH 10/16] Move service hash logic to Service.Hash method --- command/agent/consul/client.go | 15 +-------------- nomad/structs/structs.go | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 636ecb7b19d..a26befa3116 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -1023,20 +1023,7 @@ func makeAgentServiceID(role string, service *structs.Service) string { // // Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { - h := sha1.New() - io.WriteString(h, allocID) - io.WriteString(h, taskName) - io.WriteString(h, service.Name) - io.WriteString(h, service.PortLabel) - io.WriteString(h, service.AddressMode) - for _, tag := range service.Tags { - io.WriteString(h, tag) - } - - // Base32 is used for encoding the hash as sha1 hashes can always be - // encoded without padding, only 4 bytes larger than base64, and saves - // 8 bytes vs hex. - return nomadTaskPrefix + base32.StdEncoding.EncodeToString(h.Sum(nil)) + return nomadTaskPrefix + service.Hash(allocID, taskName) } // makeCheckID creates a unique ID for a check. diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 7884bbed067..a2a8f4efda3 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6,6 +6,7 @@ import ( "crypto/sha1" "crypto/sha256" "crypto/sha512" + "encoding/base32" "encoding/hex" "errors" "fmt" @@ -3140,6 +3141,26 @@ func (s *Service) ValidateName(name string) error { return nil } +// Hash returns a base32 encoded hash of a Service's contents excluding checks +// as they're hashed independently. +func (s *Service) Hash(allocID, taskName string) string { + h := sha1.New() + io.WriteString(h, allocID) + io.WriteString(h, taskName) + io.WriteString(h, s.Name) + io.WriteString(h, s.PortLabel) + io.WriteString(h, s.AddressMode) + for _, tag := range s.Tags { + io.WriteString(h, tag) + } + + // Base32 is used for encoding the hash as sha1 hashes can always be + // encoded without padding, only 4 bytes larger than base64, and saves + // 8 bytes vs hex. Since these hashes are used in Consul URLs it's nice + // to have a reasonably compact URL-safe representation. + return base32.StdEncoding.EncodeToString(h.Sum(nil)) +} + const ( // DefaultKillTimeout is the default timeout between signaling a task it // will be killed and killing it. From 6f124eba7d5283f650488f9f462e1b3a0f19e413 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 12:27:57 -0800 Subject: [PATCH 11/16] Be more defensive in port checks --- command/agent/consul/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index a26befa3116..93e001acbe7 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -1112,7 +1112,7 @@ func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet case structs.AddressModeHost: // Default path: use host ip:port ip, port := networks.Port(portLabel) - if ip == "" && port == 0 { + if ip == "" && port <= 0 { return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) } return ip, port, nil @@ -1133,7 +1133,7 @@ func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet if err != nil { return "", 0, fmt.Errorf("invalid port %q: %v", portLabel, err) } - if port == 0 { + if port <= 0 { return "", 0, fmt.Errorf("invalid port: %q: port 0 is invalid", portLabel) } From 41963605d0d78ce5dfcc23cd648a524cdb59d87d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 12:39:50 -0800 Subject: [PATCH 12/16] Expand port and address_mode docs --- .../docs/job-specification/service.html.md | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index eb462c1b01b..9707d8ec47f 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -98,23 +98,32 @@ does not automatically enable service discovery. - `port` `(string: )` - Specifies the label of the port on which this service is running. Note this is the _label_ of the port and not the port - number. The port label must match one defined in the [`network`][network] - stanza unless you're also using `address_mode="driver"`. Numeric ports may be - used when in driver addressing mode. + number unless `address_mode = driver`. The port label must match one defined + in the [`network`][network] stanza unless you're also using + `address_mode="driver"`. Numeric ports may be used when in driver addressing + mode. - `tags` `(array: [])` - Specifies the list of tags to associate with this service. If this is not supplied, no tags will be assigned to the service when it is registered. - `address_mode` `(string: "auto")` - Specifies what address (host or - driver-specific) this service should advertise. `host` indicates the host IP - and port. `driver` advertises the IP used in the driver (e.g. Docker's - internal IP) and uses the container's port specified in the port map. The - default is `auto` which behaves the same as `host` unless the driver - determines its IP should be used. This setting is supported in Docker since - Nomad 0.6 and rkt since Nomad 0.7. Nomad will advertise the container IP if a - network plugin is used (e.g. weave). See [below for - details.](#using-driver-address-mode) + driver-specific) this service should advertise. This setting is supported in + Docker since Nomad 0.6 and rkt since Nomad 0.7. See [below for + examples.](#using-driver-address-mode) Valid options are: + + - `auto` - Allows the driver to determine whether the host or driver address + should be used. Defaults to `host` and only implemented by Docker. If you + use a Docker network plugin such as weave, Docker will automatically use + its address. + + - `driver` - Use the IP specified by the driver, and the port specified in a + port map. A numeric port may be specified since port maps aren't required + by all network plugins. Useful for advertising SDN and overlay network + addresses. Task will fail if driver network cannot be determined. Only + implemented for Docker and rkt. + + - `host` - Use the host IP and port. ### `check` Parameters @@ -127,7 +136,8 @@ scripts. Unlike services, checks do not have an `auto` address mode as there's no way for Nomad to know which is the best address to use for checks. Consul needs access to the address for any HTTP or TCP checks. Added in Nomad 0.7.1. See - [below for details.](#using-driver-address-mode) + [below for details.](#using-driver-address-mode) Unlike `port`, this setting + is *not* inherited from the `service`. - `args` `(array: [])` - Specifies additional arguments to the `command`. This only applies to script-based health checks. @@ -166,13 +176,13 @@ scripts. - `port` `(string: )` - Specifies the label of the port on which the check will be performed. Note this is the _label_ of the port and not the port - number. The port label must match one defined in the [`network`][network] - stanza. If a port value was declared on the `service`, this will inherit from - that value if not supplied. If supplied, this value takes precedence over the - `service.port` value. This is useful for services which operate on multiple - ports. Checks will use the host IP and ports by default. In Nomad 0.7.1 or - later numeric ports may be used if `address_mode="driver"` is set on the - check. + number unless `address_mode = driver`. The port label must match one defined + in the [`network`][network] stanza. If a port value was declared on the + `service`, this will inherit from that value if not supplied. If supplied, + this value takes precedence over the `service.port` value. This is useful for + services which operate on multiple ports. Checks will use the host IP and + ports by default. In Nomad 0.7.1 or later numeric ports may be used if + `address_mode="driver"` is set on the check. - `protocol` `(string: "http")` - Specifies the protocol for the http-based health checks. Valid options are `http` and `https`. From 2a33e10375eab404f4ec61d0fca03dfde2655d15 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 13:49:57 -0800 Subject: [PATCH 13/16] Improve validation and testing of service/check ports --- nomad/structs/structs.go | 61 ++++++++++++++++++++-- nomad/structs/structs_test.go | 96 +++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 5 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a2a8f4efda3..1f4f533aa95 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3467,7 +3467,13 @@ func validateServices(t *Task) error { // Ensure that services don't ask for non-existent ports and their names are // unique. - servicePorts := make(map[string][]string) + servicePorts := make(map[string]map[string]struct{}) + addServicePort := func(label, service string) { + if _, ok := servicePorts[label]; !ok { + servicePorts[label] = map[string]struct{}{} + } + servicePorts[label][service] = struct{}{} + } knownServices := make(map[string]struct{}) for i, service := range t.Services { if err := service.Validate(); err != nil { @@ -3488,20 +3494,58 @@ func validateServices(t *Task) error { _, err := strconv.Atoi(service.PortLabel) if err != nil { // Not a numeric port label, add it to list to check - servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) + addServicePort(service.PortLabel, service.Name) } } else { - servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) + addServicePort(service.PortLabel, service.Name) } } - // Ensure that check names are unique. + // Ensure that check names are unique and have valid ports knownChecks := make(map[string]struct{}) for _, check := range service.Checks { if _, ok := knownChecks[check.Name]; ok { mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q is duplicate", check.Name)) } knownChecks[check.Name] = struct{}{} + + if !check.RequiresPort() { + // No need to continue validating check if it doesn't need a port + continue + } + + effectivePort := check.PortLabel + if effectivePort == "" { + // Inherits from service + effectivePort = service.PortLabel + } + + if effectivePort == "" { + mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q is missing a port", check.Name)) + continue + } + + isNumeric := false + portNumber, err := strconv.Atoi(effectivePort) + if err == nil { + isNumeric = true + } + + // Numeric ports are fine for address_mode = "driver" + if check.AddressMode == "driver" && isNumeric { + if portNumber <= 0 { + mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q has invalid numeric port %d", check.Name, portNumber)) + } + continue + } + + if isNumeric { + mErr.Errors = append(mErr.Errors, fmt.Errorf(`check %q cannot use a numeric port %d without setting address_mode="driver"`, check.Name, portNumber)) + continue + } + + // PortLabel must exist, report errors by its parent service + addServicePort(effectivePort, service.Name) } } @@ -3520,7 +3564,14 @@ func validateServices(t *Task) error { for servicePort, services := range servicePorts { _, ok := portLabels[servicePort] if !ok { - joined := strings.Join(services, ", ") + names := make([]string, 0, len(services)) + for name := range services { + names = append(names, name) + } + + // Keep order deterministic + sort.Strings(names) + joined := strings.Join(names, ", ") err := fmt.Errorf("port label %q referenced by services %v does not exist", servicePort, joined) mErr.Errors = append(mErr.Errors, err) } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 3840b9cf40e..07db00439ce 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1180,6 +1180,102 @@ func TestTask_Validate_Service_Check(t *testing.T) { } } +// TestTask_Validate_Service_Check_AddressMode asserts that checks do not +// inherit address mode but do inherit ports. +func TestTask_Validate_Service_Check_AddressMode(t *testing.T) { + task := &Task{ + Resources: &Resources{ + Networks: []*NetworkResource{ + { + DynamicPorts: []Port{ + { + Label: "http", + Value: 9999, + }, + }, + }, + }, + }, + Services: []*Service{ + { + Name: "invalid-driver", + PortLabel: "80", + AddressMode: "host", + }, + { + Name: "http-driver", + PortLabel: "80", + AddressMode: "driver", + Checks: []*ServiceCheck{ + { + // Should fail + Name: "invalid-check-1", + Type: "tcp", + Interval: time.Second, + Timeout: time.Second, + }, + { + // Should fail + Name: "invalid-check-2", + Type: "tcp", + PortLabel: "80", + Interval: time.Second, + Timeout: time.Second, + }, + { + // Should fail + Name: "invalid-check-3", + Type: "tcp", + PortLabel: "missing-port-label", + Interval: time.Second, + Timeout: time.Second, + }, + { + // Should pass + Name: "valid-script-check", + Type: "script", + Command: "ok", + Interval: time.Second, + Timeout: time.Second, + }, + { + // Should pass + Name: "valid-host-check", + Type: "tcp", + PortLabel: "http", + Interval: time.Second, + Timeout: time.Second, + }, + { + // Should pass + Name: "valid-driver-check", + Type: "tcp", + AddressMode: "driver", + Interval: time.Second, + Timeout: time.Second, + }, + }, + }, + }, + } + err := validateServices(task) + if err == nil { + t.Fatalf("expected errors but task validated successfully") + } + errs := err.(*multierror.Error).Errors + if expected := 4; len(errs) != expected { + for i, err := range errs { + t.Logf("errs[%d] -> %s", i, err) + } + t.Fatalf("expected %d errors but found %d", expected, len(errs)) + } + + assert.Contains(t, errs[0].Error(), `check "invalid-check-1" cannot use a numeric port`) + assert.Contains(t, errs[1].Error(), `check "invalid-check-2" cannot use a numeric port`) + assert.Contains(t, errs[2].Error(), `port label "80" referenced`) + assert.Contains(t, errs[3].Error(), `port label "missing-port-label" referenced`) +} + func TestTask_Validate_Service_Check_CheckRestart(t *testing.T) { invalidCheckRestart := &CheckRestart{ Limit: -1, From e857225129c2476975b22cf4a9f8bd331e18053d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 14:50:06 -0800 Subject: [PATCH 14/16] Fix port labels on mock Alloc/Job/Node --- nomad/mock/mock.go | 15 +++++++++------ scheduler/generic_sched_test.go | 2 +- scheduler/system_sched_test.go | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 41813f790f3..d0c889b0958 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -41,7 +41,7 @@ func Node() *structs.Node { { Device: "eth0", IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "main", Value: 22}}, + ReservedPorts: []structs.Port{{Label: "ssh", Value: 22}}, MBits: 1, }, }, @@ -119,7 +119,7 @@ func Job() *structs.Job { }, { Name: "${TASK}-admin", - PortLabel: "main", + PortLabel: "admin", }, }, LogConfig: structs.DefaultLogConfig(), @@ -128,8 +128,11 @@ func Job() *structs.Job { MemoryMB: 256, Networks: []*structs.NetworkResource{ { - MBits: 50, - DynamicPorts: []structs.Port{{Label: "http"}, {Label: "admin"}}, + MBits: 50, + DynamicPorts: []structs.Port{ + {Label: "http"}, + {Label: "admin"}, + }, }, }, }, @@ -273,7 +276,7 @@ func Alloc() *structs.Allocation { { Device: "eth0", IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "main", Value: 5000}}, + ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, MBits: 50, DynamicPorts: []structs.Port{{Label: "http"}}, }, @@ -287,7 +290,7 @@ func Alloc() *structs.Allocation { { Device: "eth0", IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "main", Value: 5000}}, + ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, MBits: 50, DynamicPorts: []structs.Port{{Label: "http"}}, }, diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index cdc415eb12d..02296ff76a8 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -1893,7 +1893,7 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusComplete) // Verify the network did not change - rp := structs.Port{Label: "main", Value: 5000} + rp := structs.Port{Label: "admin", Value: 5000} for _, alloc := range out { for _, resources := range alloc.TaskResources { if resources.Networks[0].ReservedPorts[0] != rp { diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index 2b25fbb56c5..be43026a1a5 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -777,7 +777,7 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusComplete) // Verify the network did not change - rp := structs.Port{Label: "main", Value: 5000} + rp := structs.Port{Label: "admin", Value: 5000} for _, alloc := range out { for _, resources := range alloc.TaskResources { if resources.Networks[0].ReservedPorts[0] != rp { From 63d515de4d30d7e884355cdc6f9efcdf6de408b1 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 15:54:04 -0800 Subject: [PATCH 15/16] Lowercase service IDs to prevent eye bleeding --- nomad/structs/structs.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1f4f533aa95..52c0f141065 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -45,6 +45,9 @@ var ( // validPolicyName is used to validate a policy name validPolicyName = regexp.MustCompile("^[a-zA-Z0-9-]{1,128}$") + + // b32 is a lowercase base32 encoding for use in URL friendly service hashes + b32 = base32.NewEncoding(strings.ToLower("abcdefghijklmnopqrstuvwxyz234567")) ) type MessageType uint8 @@ -3158,7 +3161,7 @@ func (s *Service) Hash(allocID, taskName string) string { // encoded without padding, only 4 bytes larger than base64, and saves // 8 bytes vs hex. Since these hashes are used in Consul URLs it's nice // to have a reasonably compact URL-safe representation. - return base32.StdEncoding.EncodeToString(h.Sum(nil)) + return b32.EncodeToString(h.Sum(nil)) } const ( From b79194f96cbcba94a4dc1baf244b5651adea30c7 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 8 Dec 2017 16:45:25 -0800 Subject: [PATCH 16/16] Fix test broken by mock updates --- client/driver/driver_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/client/driver/driver_test.go b/client/driver/driver_test.go index 9bdb882a02c..c2d59541c76 100644 --- a/client/driver/driver_test.go +++ b/client/driver/driver_test.go @@ -209,12 +209,12 @@ func setupTaskEnv(t *testing.T, driver string) (*allocdir.TaskDir, map[string]st "NOMAD_PORT_two": "443", "NOMAD_HOST_PORT_two": "443", "NOMAD_ADDR_admin": "1.2.3.4:8081", - "NOMAD_ADDR_web_main": "192.168.0.100:5000", + "NOMAD_ADDR_web_admin": "192.168.0.100:5000", "NOMAD_ADDR_web_http": "192.168.0.100:2000", - "NOMAD_IP_web_main": "192.168.0.100", + "NOMAD_IP_web_admin": "192.168.0.100", "NOMAD_IP_web_http": "192.168.0.100", "NOMAD_PORT_web_http": "2000", - "NOMAD_PORT_web_main": "5000", + "NOMAD_PORT_web_admin": "5000", "NOMAD_IP_admin": "1.2.3.4", "NOMAD_PORT_admin": "8081", "NOMAD_HOST_PORT_admin": "8081",