client: improve group service stanza interpolation and check_restart support #6586
The first changed file, the alloc runner, adds an import for the agent's Consul integration:

```diff
@@ -22,6 +22,7 @@ import (
 	cstate "github.com/hashicorp/nomad/client/state"
 	cstructs "github.com/hashicorp/nomad/client/structs"
 	"github.com/hashicorp/nomad/client/vaultclient"
+	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/plugins/device"
```
It also gains a new allocation-level Restart method:

```diff
@@ -1001,6 +1002,39 @@ func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent
 	return tr.Restart(context.TODO(), taskEvent, false)
 }

+// Restart satisfies the WorkloadRestarter interface and restarts all task
+// runners concurrently.
+func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
+	waitCh := make(chan struct{})
+	var err *multierror.Error
+	var errMutex sync.Mutex
+
+	go func() {
+		var wg sync.WaitGroup
+		defer close(waitCh)
+		for tn, tr := range ar.tasks {
+			wg.Add(1)
+			go func(taskName string, r agentconsul.WorkloadRestarter) {
+				defer wg.Done()
+				e := r.Restart(ctx, event, failure)
+				if e != nil {
+					errMutex.Lock()
+					defer errMutex.Unlock()
+					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
+				}
+			}(tn, tr)
+		}
+		wg.Wait()
+	}()
+
+	select {
+	case <-waitCh:
+	case <-ctx.Done():
+	}
+
+	return err.ErrorOrNil()
+}
+
 // RestartAll signals all task runners in the allocation to restart and passes
 // a copy of the task event to each restart event.
 // Returns any errors in a concatenated form.
```

Review comments on this hunk:

> We do this pattern elsewhere. Might be nice to create an errgroup-like package on top of multierror that runs all goroutines and returns all errors, unlike errgroup, which only returns the first error and cancels the other goroutines. Not a blocker, and it would make a nice follow-up PR if you want to chase it.

> I made a PR to multierror for this pattern. I will open a follow-up Nomad PR to make use of it once I get it merged: hashicorp/go-multierror#28
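For reference, here is a minimal sketch of the helper the first reviewer describes, assuming only go-multierror; the `Group` name and API mirror the proposal linked above rather than anything in this PR. Unlike golang.org/x/sync/errgroup, it runs every goroutine to completion and aggregates all errors instead of cancelling on the first failure:

```go
package main

import (
	"errors"
	"fmt"
	"sync"

	multierror "github.com/hashicorp/go-multierror"
)

// Group runs functions concurrently and, unlike errgroup, waits for all
// of them and collects every error instead of stopping at the first.
type Group struct {
	wg  sync.WaitGroup
	mu  sync.Mutex
	err *multierror.Error
}

// Go starts f in its own goroutine and records any error it returns.
func (g *Group) Go(f func() error) {
	g.wg.Add(1)
	go func() {
		defer g.wg.Done()
		if err := f(); err != nil {
			g.mu.Lock()
			g.err = multierror.Append(g.err, err)
			g.mu.Unlock()
		}
	}()
}

// Wait blocks until every function started with Go has returned, then
// reports the combined errors (nil if none failed).
func (g *Group) Wait() error {
	g.wg.Wait()
	return g.err.ErrorOrNil()
}

func main() {
	var g Group
	for _, name := range []string{"web", "sidecar"} {
		name := name // capture loop variable for the closure
		g.Go(func() error { return errors.New("failed to restart task " + name) })
	}
	fmt.Println(g.Wait()) // prints both errors, not just the first
}
```

With a helper like this, the Restart method above reduces to a loop of `g.Go` calls plus a single `g.Wait`, though the `select` on `ctx.Done()` would still be needed if callers must not block past cancellation.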
The second changed file is the group service hook:
```diff
@@ -3,30 +3,63 @@ package allocrunner
 import (
 	"sync"

-	hclog "github.com/hashicorp/go-hclog"
 	log "github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	"github.com/hashicorp/nomad/client/consul"
+	"github.com/hashicorp/nomad/client/taskenv"
+	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/hashicorp/nomad/plugins/drivers"
 )

 // groupServiceHook manages task group Consul service registration and
 // deregistration.
 type groupServiceHook struct {
-	alloc        *structs.Allocation
+	allocID      string
+	group        string
+	restarter    agentconsul.WorkloadRestarter
 	consulClient consul.ConsulServiceAPI
 	prerun       bool
-	mu           sync.Mutex

 	logger log.Logger

+	// The following fields may be updated
+	canary         bool
+	services       []*structs.Service
+	networks       structs.Networks
+	taskEnvBuilder *taskenv.Builder
+
+	// Since Update() may be called concurrently with any other hook, all
+	// hook methods must be fully serialized.
+	mu sync.Mutex
 }

+type groupServiceHookConfig struct {
+	alloc          *structs.Allocation
+	consul         consul.ConsulServiceAPI
+	restarter      agentconsul.WorkloadRestarter
+	taskEnvBuilder *taskenv.Builder
+	logger         log.Logger
+}
+
-func newGroupServiceHook(logger hclog.Logger, alloc *structs.Allocation, consulClient consul.ConsulServiceAPI) *groupServiceHook {
+func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
 	h := &groupServiceHook{
-		alloc:        alloc,
-		consulClient: consulClient,
+		allocID:        cfg.alloc.ID,
+		group:          cfg.alloc.TaskGroup,
+		restarter:      cfg.restarter,
+		consulClient:   cfg.consul,
+		taskEnvBuilder: cfg.taskEnvBuilder,
 	}
+	h.logger = cfg.logger.Named(h.Name())
+	h.services = cfg.alloc.Job.LookupTaskGroup(h.group).Services
+
+	if cfg.alloc.AllocatedResources != nil {
+		h.networks = cfg.alloc.AllocatedResources.Shared.Networks
+	}
+
+	if cfg.alloc.DeploymentStatus != nil {
+		h.canary = cfg.alloc.DeploymentStatus.Canary
+	}
-	h.logger = logger.Named(h.Name())
 	return h
 }
```
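For orientation, here is a hedged sketch of how the alloc runner might wire this hook up. The field names on `ar` and the `taskenv.NewBuilder` signature are assumptions for illustration and do not come from this diff:

```go
// Hypothetical wiring inside the alloc runner's hook setup. The
// allocRunner itself is passed as the restarter, since it satisfies
// WorkloadRestarter via the Restart method added above. The field
// names on ar and the taskenv.NewBuilder signature are assumptions.
func (ar *allocRunner) newGroupServiceHookExample() *groupServiceHook {
	alloc := ar.Alloc()
	return newGroupServiceHook(groupServiceHookConfig{
		alloc:          alloc,
		consul:         ar.consulClient,
		restarter:      ar,
		taskEnvBuilder: taskenv.NewBuilder(ar.clientConfig.Node, alloc, nil, ar.clientConfig.Region),
		logger:         ar.logger,
	})
}
```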
```diff
@@ -41,26 +74,97 @@ func (h *groupServiceHook) Prerun() error {
 		h.prerun = true
 		h.mu.Unlock()
 	}()
-	return h.consulClient.RegisterGroup(h.alloc)
+
+	if len(h.services) == 0 {
+		return nil
+	}
+
+	services := h.getWorkloadServices()
+	return h.consulClient.RegisterWorkload(services)
 }

 func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
 	h.mu.Lock()
 	defer h.mu.Unlock()
-	oldAlloc := h.alloc
-	h.alloc = req.Alloc
+	oldWorkloadServices := h.getWorkloadServices()
+
+	// Store new updated values out of request
+	canary := false
+	if req.Alloc.DeploymentStatus != nil {
+		canary = req.Alloc.DeploymentStatus.Canary
+	}
+
+	var networks structs.Networks
+	if req.Alloc.AllocatedResources != nil {
+		networks = req.Alloc.AllocatedResources.Shared.Networks
+	}
+
+	// Update group service hook fields
+	h.networks = networks
+	h.services = req.Alloc.Job.LookupTaskGroup(h.group).Services
+	h.canary = canary
+	h.taskEnvBuilder.UpdateTask(req.Alloc, nil)
+
+	// Create new task services struct with those new values
+	newWorkloadServices := h.getWorkloadServices()

 	if !h.prerun {
 		// Update called before Prerun. Update alloc and exit to allow
 		// Prerun to do initial registration.
 		return nil
 	}

-	return h.consulClient.UpdateGroup(oldAlloc, h.alloc)
+	return h.consulClient.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
 }

 func (h *groupServiceHook) Postrun() error {
 	h.mu.Lock()
 	defer h.mu.Unlock()
-	return h.consulClient.RemoveGroup(h.alloc)
+	h.deregister()
+	return nil
 }

+func (h *groupServiceHook) driverNet() *drivers.DriverNetwork {
+	if len(h.networks) == 0 {
+		return nil
+	}
+
+	//TODO(schmichael) only support one network for now
+	net := h.networks[0]
+	//TODO(schmichael) there's probably a better way than hacking driver network
+	return &drivers.DriverNetwork{
+		AutoAdvertise: true,
+		IP:            net.IP,
+		// Copy PortLabels from group network
+		PortMap: net.PortLabels(),
+	}
+}
+
+// deregister services from Consul.
+func (h *groupServiceHook) deregister() {
+	if len(h.services) > 0 {
+		workloadServices := h.getWorkloadServices()
+		h.consulClient.RemoveWorkload(workloadServices)
+
+		// Canary flag may be getting flipped when the alloc is being
+		// destroyed, so remove both variations of the service
+		workloadServices.Canary = !workloadServices.Canary
+		h.consulClient.RemoveWorkload(workloadServices)
+	}
+}
+
+func (h *groupServiceHook) getWorkloadServices() *agentconsul.WorkloadServices {
+	// Interpolate with the task's environment
+	interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)
+
+	// Create task services struct with request's driver metadata
+	return &agentconsul.WorkloadServices{
+		AllocID:       h.allocID,
+		Group:         h.group,
+		Restarter:     h.restarter,
+		Services:      interpolatedServices,
+		DriverNetwork: h.driverNet(),
+		Networks:      h.networks,
+		Canary:        h.canary,
+	}
+}
```

Review comment on the TODOs in driverNet:

> I eagerly await the day these comments are in red. 😁
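One piece of context worth spelling out for the check_restart support: the Restarter threaded into WorkloadServices is what lets the Consul check watcher restart the whole group when a check_restart policy trips. The interface itself is not shown in this diff, but its shape follows from the allocRunner.Restart signature above; it is reproduced here for reference:

```go
// WorkloadRestarter is the interface (in command/agent/consul) that
// allocRunner.Restart satisfies, per the method comment above. It lets
// the Consul integration restart an entire workload, now including a
// task group, when a check_restart stanza's conditions are met.
type WorkloadRestarter interface {
	Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error
}
```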
Review discussion on the new Restart method:

> Note to reviewers: This might be overkill, but I wanted restarts to be as quick as possible, especially when there are numerous tasks in the group. I'd like to hear your thoughts.

> Should this replace the RestartAll method below?
>
> Also, what happens when we're in the middle of this Restart but we get a call to update or kill? The alloc runner hooks get their actions serialized by a lock, but allocRunner has a lot of "make sure you call this after/before this other method". (Although this is probably out of scope for this PR, because that's the existing condition of things here.)