-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
identity: Implement change_mode
#18943
Changes from all commits
9912052
9a4cf5f
3332a2b
3f0f4d1
d65330b
1a977e5
86d0e97
8934958
616ca79
33f15cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
```release-note:improvement | ||
identity: Implement `change_mode` and `change_signal` for workload identities | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,13 @@ import ( | |
"context" | ||
"fmt" | ||
"path/filepath" | ||
"time" | ||
|
||
"github.com/hashicorp/consul-template/signals" | ||
log "github.com/hashicorp/go-hclog" | ||
|
||
"github.com/hashicorp/nomad/client/allocrunner/interfaces" | ||
ti "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces" | ||
"github.com/hashicorp/nomad/client/taskenv" | ||
"github.com/hashicorp/nomad/client/widmgr" | ||
"github.com/hashicorp/nomad/helper/users" | ||
|
@@ -37,6 +40,7 @@ type identityHook struct { | |
task *structs.Task | ||
tokenDir string | ||
envBuilder *taskenv.Builder | ||
lifecycle ti.TaskLifecycle | ||
ts tokenSetter | ||
widmgr widmgr.IdentityManager | ||
logger log.Logger | ||
|
@@ -52,6 +56,7 @@ func newIdentityHook(tr *TaskRunner, logger log.Logger) *identityHook { | |
task: tr.Task(), | ||
tokenDir: tr.taskDir.SecretsDir, | ||
envBuilder: tr.envBuilder, | ||
lifecycle: tr, | ||
ts: tr, | ||
widmgr: tr.widmgr, | ||
stopCtx: stopCtx, | ||
|
@@ -65,52 +70,138 @@ func (*identityHook) Name() string { | |
return "identity" | ||
} | ||
|
||
func (h *identityHook) Prestart(context.Context, *interfaces.TaskPrestartRequest, *interfaces.TaskPrestartResponse) error { | ||
func (h *identityHook) Prestart(ctx context.Context, _ *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error { | ||
|
||
// Handle default workload identity | ||
if err := h.setDefaultToken(); err != nil { | ||
return err | ||
} | ||
|
||
// Track first run signals from watchers | ||
firstRunCh := make(chan struct{}, len(h.task.Identities)) | ||
|
||
// Start token watcher loops | ||
for _, widspec := range h.task.Identities { | ||
w := widspec | ||
go h.watchIdentity(w) | ||
go h.watchIdentity(w, firstRunCh) | ||
} | ||
|
||
// Don't block indefinitely for identities | ||
deadlineTimer := time.NewTimer(time.Minute) | ||
defer deadlineTimer.Stop() | ||
|
||
// Wait until every watcher ticks the first run chan | ||
for i := range h.task.Identities { | ||
select { | ||
case <-firstRunCh: | ||
// Identity fetched, loop | ||
case <-deadlineTimer.C: | ||
h.logger.Warn("timed out waiting for initial identity tokens to be fetched", | ||
"num_fetched", i, "num_total", len(h.task.Identities)) | ||
return nil | ||
Comment on lines
+98
to
+101
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The group-level […] This hook is early in the taskrunner, so returning an error early will also prevent us from doing much more expensive setup work only to throw it away (ex. […]). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (That being said, I wouldn't block this PR on this.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, that makes a ton more sense. So the only threat here is for Clients so busy they can't pluck some structs off a chan and throw them on disk (if file=true) in time. No racing the network or remote machines involved. When nodes are rebooted they can be quite busy, so I am happy to have this race fixed. 1 minute does seem like an eternity though, so perhaps short-circuiting with an error is better than making a best-effort by letting it trundle on. 🤔 It ought to be sufficiently unlikely that it won't matter, so I don't think I'm going to write up an issue until someone hits it? 🤔 |
||
case <-ctx.Done(): | ||
h.logger.Debug("task prestart cancelled before initial identity tokens were fetched", | ||
"num_fetched", i, "num_total", len(h.task.Identities)) | ||
return nil | ||
case <-h.stopCtx.Done(): | ||
h.logger.Debug("task stopped before initial identity tokens were fetched", | ||
"num_fetched", i, "num_total", len(h.task.Identities)) | ||
return nil | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (h *identityHook) watchIdentity(wid *structs.WorkloadIdentity) { | ||
func (h *identityHook) watchIdentity(wid *structs.WorkloadIdentity, runCh chan struct{}) { | ||
id := structs.WIHandle{WorkloadIdentifier: h.task.Name, IdentityName: wid.Name} | ||
signedIdentitiesChan, stopWatching := h.widmgr.Watch(id) | ||
defer stopWatching() | ||
|
||
firstRun := true | ||
|
||
for { | ||
select { | ||
case signedWID, ok := <-signedIdentitiesChan: | ||
h.logger.Trace("receiving renewed identity", "identity_name", wid.Name) | ||
h.logger.Trace("receiving renewed identity", "identity", wid.Name) | ||
if !ok { | ||
// Chan was closed, stop watching | ||
h.logger.Trace("identity watch closed", "task", h.task.Name, "identity", wid.Name) | ||
h.logger.Trace("identity watch closed", "identity", wid.Name) | ||
return | ||
} | ||
|
||
if signedWID == nil { | ||
// The only way to hit this should be a bug as it indicates the server | ||
// did not sign an identity for a task on this alloc. | ||
h.logger.Error("missing workload identity %q", wid.Name) | ||
return | ||
} | ||
|
||
if err := h.setAltToken(wid, signedWID.JWT); err != nil { | ||
h.logger.Error(err.Error()) | ||
} | ||
|
||
// Skip ChangeMode on firstRun and notify caller it can proceed | ||
if firstRun { | ||
select { | ||
case runCh <- struct{}{}: | ||
default: | ||
// Not great but not necessarily fatal | ||
h.logger.Warn("task started before identity %q was fetched", wid.Name) | ||
} | ||
|
||
firstRun = false | ||
continue | ||
} | ||
|
||
switch wid.ChangeMode { | ||
case structs.WIChangeModeRestart: | ||
const noFailure = false | ||
err := h.lifecycle.Restart(h.stopCtx, structs.NewTaskEvent(structs.TaskRestartSignal). | ||
SetDisplayMessage(fmt.Sprintf("Identity[%s]: new token acquired", wid.Name)), noFailure) | ||
if err != nil { | ||
// Ignore error from kill because if that fails there's really | ||
// nothing to be done. | ||
_ = h.lifecycle.Kill(h.stopCtx, structs.NewTaskEvent(structs.TaskKilling). | ||
SetFailsTask(). | ||
SetDisplayMessage(fmt.Sprintf("Identity[%s]: failed to restart: %v", wid.Name, err))) | ||
return | ||
} | ||
|
||
case structs.WIChangeModeSignal: | ||
if err := h.signalTask(wid); err != nil { | ||
h.logger.Error("failed to send signal", "identity", wid.Name, "signal", wid.ChangeSignal) | ||
// Ignore error from kill because if that fails there's really | ||
// nothing to be done. | ||
_ = h.lifecycle.Kill(h.stopCtx, structs.NewTaskEvent(structs.TaskKilling). | ||
SetFailsTask(). | ||
SetDisplayMessage(fmt.Sprintf("Identity[%s]: failed to send signal: %v", wid.Name, err))) | ||
return | ||
} | ||
|
||
} | ||
|
||
// Note: any code added here will not run on first run | ||
|
||
case <-h.stopCtx.Done(): | ||
return | ||
} | ||
} | ||
} | ||
|
||
// signalTask sends the configured signal to a task or returns an error. | ||
func (h *identityHook) signalTask(wid *structs.WorkloadIdentity) error { | ||
s, err := signals.Parse(wid.ChangeSignal) | ||
if err != nil { | ||
return fmt.Errorf("failed to parse signal: %w", err) | ||
} | ||
|
||
event := structs.NewTaskEvent(structs.TaskSignaling). | ||
SetTaskSignal(s). | ||
SetDisplayMessage(fmt.Sprintf("Identity[%s]: new Identity token acquired", wid.Name)) | ||
return h.lifecycle.Signal(event, wid.ChangeSignal) | ||
} | ||
|
||
// setDefaultToken adds the Nomad token to the task's environment and writes it to a | ||
// file if requested by the jobspec. | ||
func (h *identityHook) setDefaultToken() error { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
kind of makes me think there should be a context-aware version of WaitGroup
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
100% what I needed (or a WaitGroup where Wait() returns a chan)