Nomad agent does not notify the task about the change of dynamic secrets after restart #4226
Comments
@schmichael We have the same issue with dynamic secrets when the Nomad agent restarts during an upgrade. Of course we can do a node drain, but that is not very convenient, and it also doesn't cover cases where Nomad crashes (for example due to an internal bug) and is restarted by systemd, upstart, etc.
We made a patch to solve this:
 client/alloc_runner.go | 10 +++++-
client/consul_template.go | 80 +++++++++++++++++++++++++++++++++++++++++++----
client/task_runner.go | 19 ++++++-----
3 files changed, 95 insertions(+), 14 deletions(-)
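In short, the patch threads a context from the allocation-restore path (AllocRunner.RestoreState → TaskRunner.RunWithContext → NewTaskTemplateManager) down into the template manager's run loop: when the manager starts as part of a restore and the first render produces events, the new CallSignals helper re-applies each template's change_mode (signal or restart), so the task is notified about secrets that were re-rendered while the agent was down.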
diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index aeb284a3b..8ec141df4 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -29,6 +29,12 @@ var (
allocRunnerStateAllocDirKey = []byte("alloc-dir")
)
+type TemplateManagerLogger struct{
+ logger *log.Logger
+ allocID string
+ taskName string
+}
+
// AllocStateUpdater is used to update the status of an allocation
type AllocStateUpdater func(alloc *structs.Allocation)
@@ -351,8 +357,10 @@ func (r *AllocRunner) RestoreState() error {
r.logger.Printf("[ERR] client: failed to restore state for alloc %s task %q: %v", r.allocID, name, err)
mErr.Errors = append(mErr.Errors, err)
} else if !r.alloc.TerminalStatus() {
+ ctx := context.WithValue(context.Background(), "logger", &TemplateManagerLogger{r.logger, r.allocID, name})
+
// Only start if the alloc isn't in a terminal status.
- go tr.Run()
+ go tr.RunWithContext(ctx)
if upgrading {
if err := tr.SaveState(); err != nil {
diff --git a/client/consul_template.go b/client/consul_template.go
index 3e4869706..0a53c622a 100644
--- a/client/consul_template.go
+++ b/client/consul_template.go
@@ -8,6 +8,7 @@ import (
"sort"
"strconv"
"strings"
+ "context"
"sync"
"time"
@@ -131,7 +132,7 @@ func (c *TaskTemplateManagerConfig) Validate() error {
return nil
}
-func NewTaskTemplateManager(config *TaskTemplateManagerConfig) (*TaskTemplateManager, error) {
+func NewTaskTemplateManager(ctx context.Context, config *TaskTemplateManagerConfig) (*TaskTemplateManager, error) {
// Check pre-conditions
if err := config.Validate(); err != nil {
return nil, err
@@ -168,7 +169,7 @@ func NewTaskTemplateManager(config *TaskTemplateManagerConfig) (*TaskTemplateMan
tm.runner = runner
tm.lookup = lookup
- go tm.run()
+ go tm.run(ctx)
return tm, nil
}
@@ -191,7 +192,7 @@ func (tm *TaskTemplateManager) Stop() {
}
// run is the long lived loop that handles errors and templates being rendered
-func (tm *TaskTemplateManager) run() {
+func (tm *TaskTemplateManager) run(ctx context.Context) {
// Runner is nil if there is no templates
if tm.runner == nil {
// Unblock the start if there is nothing to do
@@ -203,7 +204,18 @@ func (tm *TaskTemplateManager) run() {
go tm.runner.Start()
// Block till all the templates have been rendered
- tm.handleFirstRender()
+ events := tm.handleFirstRender()
+ if ctx != nil {
+ if events != nil {
+ if v := ctx.Value("logger"); v != nil {
+ if logger, ok := v.(*TemplateManagerLogger); ok {
+ logger.logger.Printf("[INFO] Calling signals for alloc %s task %s: after restore due templates rerender", logger.allocID, logger.taskName)
+ }
+ }
+
+ tm.CallSignals(events)
+ }
+ }
// Detect if there was a shutdown.
select {
@@ -233,9 +245,10 @@ func (tm *TaskTemplateManager) run() {
}
// handleFirstRender blocks till all templates have been rendered
-func (tm *TaskTemplateManager) handleFirstRender() {
+func (tm *TaskTemplateManager) handleFirstRender() map[string]*manager.RenderEvent {
// missingDependencies is the set of missing dependencies.
var missingDependencies map[string]struct{}
+ var l_retevents map[string]*manager.RenderEvent
// eventTimer is used to trigger the firing of an event showing the missing
// dependencies.
@@ -253,7 +266,7 @@ WAIT:
for {
select {
case <-tm.shutdownCh:
- return
+ return nil
case err, ok := <-tm.runner.ErrCh:
if !ok {
continue
@@ -276,6 +289,8 @@ WAIT:
}
}
+ l_retevents = events
+
break WAIT
case <-tm.runner.RenderEventCh():
events := tm.runner.RenderEvents()
@@ -340,6 +355,59 @@ WAIT:
tm.config.Hooks.EmitEvent(consulTemplateSourceName, fmt.Sprintf("Missing: %s", missingStr))
}
}
+
+ return l_retevents
+}
+
+func (tm *TaskTemplateManager) CallSignals(events map[string]*manager.RenderEvent) {
+ for id, event := range events {
+ if !event.DidRender {
+ continue
+ }
+
+ // Lookup the template and determine what to do
+ tmpls, ok := tm.lookup[id]
+ if !ok {
+ tm.config.Hooks.Kill(consulTemplateSourceName, fmt.Sprintf("template runner returned unknown template id %q", id), true)
+ break
+ }
+
+ signals := make(map[string]struct{})
+ restart := false
+
+ for _, tmpl := range tmpls {
+ switch tmpl.ChangeMode {
+ case structs.TemplateChangeModeSignal:
+ signals[tmpl.ChangeSignal] = struct{}{}
+ case structs.TemplateChangeModeRestart:
+ restart = true
+ case structs.TemplateChangeModeNoop:
+ continue
+ }
+ }
+
+ if restart {
+ const failure = false
+ tm.config.Hooks.Restart(consulTemplateSourceName, "template with change_mode restart re-rendered", failure)
+ } else if len(signals) != 0 {
+ var mErr multierror.Error
+ for signal := range signals {
+ err := tm.config.Hooks.Signal(consulTemplateSourceName, "template re-rendered", tm.signals[signal])
+ if err != nil {
+ multierror.Append(&mErr, err)
+ }
+ }
+
+ if err := mErr.ErrorOrNil(); err != nil {
+ flat := make([]os.Signal, 0, len(signals))
+ for signal := range signals {
+ flat = append(flat, tm.signals[signal])
+ }
+
+ tm.config.Hooks.Kill(consulTemplateSourceName, fmt.Sprintf("Sending signals %v failed: %v", flat, err), true)
+ }
+ }
+ }
}
// handleTemplateRerenders is used to handle template render events after they
diff --git a/client/task_runner.go b/client/task_runner.go
index 4affba3e6..03a1aa2b8 100644
--- a/client/task_runner.go
+++ b/client/task_runner.go
@@ -13,6 +13,7 @@ import (
"strings"
"sync"
"time"
+ "context"
metrics "github.com/armon/go-metrics"
"github.com/boltdb/bolt"
@@ -575,7 +576,7 @@ func (r *TaskRunner) createDriver() (driver.Driver, error) {
}
// Run is a long running routine used to manage the task
-func (r *TaskRunner) Run() {
+func (r *TaskRunner) RunWithContext(ctx context.Context) {
defer close(r.waitCh)
r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
r.task.Name, r.alloc.ID)
@@ -622,7 +623,7 @@ func (r *TaskRunner) Run() {
}
// Start the run loop
- r.run()
+ r.run(ctx)
// Do any cleanup necessary
r.postrun()
@@ -630,6 +631,10 @@ func (r *TaskRunner) Run() {
return
}
+func (r *TaskRunner) Run() {
+ r.RunWithContext(nil)
+}
+
// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
@@ -911,7 +916,7 @@ func (r *TaskRunner) updatedTokenHandler() {
// Create a new templateManager
var err error
- r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
+ r.templateManager, err = NewTaskTemplateManager(nil, &TaskTemplateManagerConfig{
Hooks: r,
Templates: r.task.Templates,
ClientConfig: r.config,
@@ -936,7 +941,7 @@ func (r *TaskRunner) updatedTokenHandler() {
// prestart handles life-cycle tasks that occur before the task has started.
// Since it's run asynchronously with the main Run() loop the alloc & task are
// passed in to avoid racing with updates.
-func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
+func (r *TaskRunner) prestart(ctx context.Context, alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
if task.Vault != nil {
// Wait for the token
r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
@@ -1027,7 +1032,7 @@ func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, res
// Build the template manager
if r.templateManager == nil {
var err error
- r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
+ r.templateManager, err = NewTaskTemplateManager(ctx, &TaskTemplateManagerConfig{
Hooks: r,
Templates: r.task.Templates,
ClientConfig: r.config,
@@ -1083,7 +1088,7 @@ func (r *TaskRunner) postrun() {
// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
-func (r *TaskRunner) run() {
+func (r *TaskRunner) run(ctx context.Context) {
// Predeclare things so we can jump to the RESTART
var stopCollection chan struct{}
var handleWaitCh chan *dstructs.WaitResult
@@ -1101,7 +1106,7 @@ func (r *TaskRunner) run() {
for {
// Do the prestart activities
prestartResultCh := make(chan bool, 1)
- go r.prestart(r.alloc, r.task, prestartResultCh)
+ go r.prestart(ctx, r.alloc, r.task, prestartResultCh)
WAIT:
for {
@schmichael please take into account that this issue is still present in Nomad 0.9.3. When we restart the nomad agent (on Ubuntu we use systemd), the allocation is not restarted.
We got bitten by this same issue (described in #6638). Our application kept using the old database username/password after restarting the Nomad client agent, and failed to connect to the database when Vault expired the old database user. This is a serious problem for Nomad jobs working with Vault, especially now that the 0.10 release is out and I guess a lot of people will be manually restarting their Nomad agents soon...
We encountered the same issue. It took us a while (several Nomad versions) to figure out what was going on, since our max TTL was set pretty high and we could never correlate it to a Nomad agent restart. All our tests confirm the issue. Nomad v0.10.2
Hi @schmichael @tgross, any chance this will be picked up in a coming release? This makes using dynamic vault credentials in nomad very difficult and unstable. We're running a few hacky workarounds for the moment, but we can't do that sustainably. Thanks!
Same here. My client ran into this and is now asking questions about the reliability of the stack and whether we should look for alternatives.
Still the case for version 0.11.0. I would also love to see this bug fixed.
@filip-vt What is the workaround you are referring to?
In my case, I updated a key (v1 kv) in Vault, but the secrets files are not updated, and the job is therefore also not restarted. Even when I restart the task manually from the UI, it still does not renew the secret.
template {
  data          = "{{with secret \"secret/kv/certificate/domain\"}}{{.Data.privkey}}{{end}}"
  destination   = "secrets/cert.key"
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
template {
  data          = "{{with secret \"secret/kv/certificate/domain\"}}{{.Data.fullchain}}{{end}}"
  destination   = "secrets/cert.crt"
  change_mode   = "signal"
  change_signal = "SIGHUP"
}
I'm seeing the same problem with Consul KV values. I followed the example that @schmichael gave in #5191, but my task is not getting restarted when the value changes. @schmichael can you please verify that your example still works? I'm using Nomad v0.11.2. I verified that the template file itself is updated, but a task restart is not initiated.
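For reference, a Consul KV template stanza of this kind typically looks roughly like the sketch below; the key name and destination are illustrative, not the commenter's actual job:
template {
  # illustrative Consul KV key and destination, not taken from the original job
  data        = "{{ key \"myapp/config/db_host\" }}"
  destination = "local/db_host.txt"
  change_mode = "restart"
}
With change_mode = "restart", Nomad is expected to restart the task whenever the rendered file changes, which is exactly the step that does not happen after an agent restart.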
We are experiencing this issue as well: our dynamic secrets get updated in the alloc secrets file, but the container itself doesn't get updated. We are currently using Nomad v0.10.2.
Fixed (finally) in #9636, which will ship in the upcoming Nomad 1.0.2.
I'm going to lock this issue because it has been closed for 120 days ⏳. This helps our maintainers find and focus on the active issues.
Nomad version
Nomad 0.8.1
Vault 0.10.0
Operating system and Environment details
Ubuntu 16.04.4 LTS
Issue
After restarting the Nomad agent on the node where the task is placed, the agent receives new dynamic secrets from Vault but does not notify the task.
Reproduction steps
Restart the Nomad agent on the node where the task is placed. The agent will immediately receive a new dynamic secret, but the task will continue to work with the outdated secret; the agent will send neither a signal nor a restart.
Nomad Client logs (if appropriate)
Job file (if appropriate)
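The job file was not attached. A minimal template stanza for this kind of setup might look like the sketch below; the secret path, destination, and change_mode are illustrative assumptions, not the reporter's actual configuration:
template {
  # hypothetical dynamic-secret path and destination, for illustration only
  data        = "{{ with secret \"database/creds/myapp\" }}{{ .Data.username }}:{{ .Data.password }}{{ end }}"
  destination = "secrets/db.creds"
  change_mode = "restart"
}
On a re-render, change_mode = "restart" (or "signal") is supposed to notify the task; the bug reported here is that this notification is skipped for renders triggered by an agent restart.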