From feaf6214f992d9140a42b946b08a57d8b22ed051 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 30 Oct 2018 09:08:53 -0400 Subject: [PATCH 01/14] Set User-Agent header when hitting Vault API --- client/vaultclient/vaultclient.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/client/vaultclient/vaultclient.go b/client/vaultclient/vaultclient.go index e08bd33eaaf..94f350cbd5b 100644 --- a/client/vaultclient/vaultclient.go +++ b/client/vaultclient/vaultclient.go @@ -4,6 +4,7 @@ import ( "container/heap" "fmt" "math/rand" + "net/http" "strings" "sync" "time" @@ -157,6 +158,10 @@ func NewVaultClient(config *config.VaultConfig, logger hclog.Logger, tokenDerive return nil, err } + client.SetHeaders(http.Header{ + "User-Agent": []string{"HashiCorp/nomad"}, + }) + c.client = client return c, nil From 05d9b399423824b438103db5e77a7c4484afd160 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 30 Oct 2018 10:19:25 -0400 Subject: [PATCH 02/14] Set a 1s floor for Vault renew operation backoff --- nomad/vault.go | 73 +++++++++++++++++++++++++++------------------ nomad/vault_test.go | 39 ++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/nomad/vault.go b/nomad/vault.go index 91fc30df0c5..b159f061184 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -491,33 +491,9 @@ func (v *vaultClient) renewalLoop() { break } - // Back off, increasing the amount of backoff each time. There are some rules: - // - // * If we have an existing authentication that is going to expire, - // never back off more than half of the amount of time remaining - // until expiration - // * Never back off more than 30 seconds multiplied by a random - // value between 1 and 2 - // * Use randomness so that many clients won't keep hitting Vault - // at the same time - - // Set base values and add some backoff - v.logger.Warn("got error or bad auth, so backing off", "error", err) - switch { - case backoff < 5: - backoff = 5 - case backoff >= 24: - backoff = 30 - default: - backoff = backoff * 1.25 - } - - // Add randomness - backoff = backoff * (1.0 + rand.Float64()) - - maxBackoff := currentExpiration.Sub(time.Now()) / 2 - if maxBackoff < 0 { + backoff = nextBackoff(backoff, currentExpiration) + if backoff < 0 { // We have failed to renew the token past its expiration. Stop // renewing with Vault. v.logger.Error("failed to renew Vault token before lease expiration. Shutting down Vault client") @@ -526,9 +502,6 @@ func (v *vaultClient) renewalLoop() { v.connEstablishedErr = err v.l.Unlock() return - - } else if backoff > maxBackoff.Seconds() { - backoff = maxBackoff.Seconds() } durationUntilRetry := time.Duration(backoff) * time.Second @@ -539,6 +512,48 @@ func (v *vaultClient) renewalLoop() { } } +// nextBackoff returns the delay for the next auto renew interval, in seconds. +// Returns negative value if past expiration +// +// It should increaes the amount of backoff each time, with the following rules: +// +// * If we have an existing authentication that is going to expire, +// never back off more than half of the amount of time remaining +// until expiration (with 1s floor) +// * Never back off more than 30 seconds multiplied by a random +// value between 1 and 2 +// * Use randomness so that many clients won't keep hitting Vault +// at the same time +func nextBackoff(backoff float64, expiry time.Time) float64 { + maxBackoff := time.Until(expiry) / 2 + if maxBackoff < 0 { + return -1 + } + + switch { + case backoff < 5: + backoff = 5 + case backoff >= 24: + backoff = 30 + default: + backoff = backoff * 1.25 + } + + // Add randomness + backoff = backoff * (1.0 + rand.Float64()) + + if backoff > maxBackoff.Seconds() { + backoff = maxBackoff.Seconds() + } + + // avoid hammering Vault + if backoff < 1 { + backoff = 1 + } + + return backoff +} + // renew attempts to renew our Vault token. If the renewal fails, an error is // returned. This method updates the lastRenewed time func (v *vaultClient) renew() error { diff --git a/nomad/vault_test.go b/nomad/vault_test.go index f7fdbddb82f..191eea9df6b 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -1258,3 +1258,42 @@ func waitForConnection(v *vaultClient, t *testing.T) { t.Fatalf("Connection not established") }) } + +func TestVaultClient_nextBackoff(t *testing.T) { + simpleCases := []struct { + name string + initBackoff float64 + + // define range of acceptable backoff values accounting for random factor + rangeMin float64 + rangeMax float64 + }{ + {"simple case", 7.0, 8.7, 17.60}, + {"too low", 2.0, 5.0, 10.0}, + {"too large", 100, 30.0, 60.0}, + } + + for _, c := range simpleCases { + t.Run(c.name, func(t *testing.T) { + b := nextBackoff(c.initBackoff, time.Now().Add(10*time.Hour)) + if !(c.rangeMin <= b && b <= c.rangeMax) { + t.Fatalf("Expected backoff within [%v, %v] but found %v", c.rangeMin, c.rangeMax, b) + } + }) + } + + // some edge cases + t.Run("close to expiry", func(t *testing.T) { + b := nextBackoff(20, time.Now().Add(1100*time.Millisecond)) + if !(1.0 <= b && b <= 3) { + t.Fatalf("Expected backoff within [1, 3] but found %v", b) + } + }) + + t.Run("past expiry", func(t *testing.T) { + b := nextBackoff(20, time.Now().Add(-1100*time.Millisecond)) + if b >= 0.0 { + t.Fatalf("Expected backoff to be negative but found %v", b) + } + }) +} From 88c1698ef50b09edebb0f4d53245d7a80089175c Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 30 Oct 2018 10:53:19 -0400 Subject: [PATCH 03/14] Emit metric counters for Vault token and renewal failures --- client/vaultclient/vaultclient.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/client/vaultclient/vaultclient.go b/client/vaultclient/vaultclient.go index 94f350cbd5b..7494fb2c932 100644 --- a/client/vaultclient/vaultclient.go +++ b/client/vaultclient/vaultclient.go @@ -9,6 +9,7 @@ import ( "sync" "time" + "github.com/armon/go-metrics" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -303,6 +304,7 @@ func (c *vaultClient) RenewToken(token string, increment int) (<-chan error, err // error channel. if err := c.renew(renewalReq); err != nil { c.logger.Error("error during renewal of token", "error", err) + metrics.IncrCounter([]string{"client", "vault", "renew_token_failure"}, 1) return nil, err } @@ -340,6 +342,7 @@ func (c *vaultClient) RenewLease(leaseId string, increment int) (<-chan error, e // Renew the secret and send any error to the dedicated error channel if err := c.renew(renewalReq); err != nil { c.logger.Error("error during renewal of lease", "error", err) + metrics.IncrCounter([]string{"client", "vault", "renew_lease_error"}, 1) return nil, err } @@ -536,6 +539,7 @@ func (c *vaultClient) run() { case <-renewalCh: if err := c.renew(renewalReq); err != nil { c.logger.Error("error renewing token", "error", err) + metrics.IncrCounter([]string{"client", "vault", "renew_token_error"}, 1) } case <-c.updateCh: continue From f181f1a07a9d9bfefbaeaf3793bdcdab040e5751 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 30 Oct 2018 12:38:32 -0400 Subject: [PATCH 04/14] Wrap Vault API api errors for easing debugging --- nomad/vault.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nomad/vault.go b/nomad/vault.go index b159f061184..5f21081df57 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -560,7 +560,7 @@ func (v *vaultClient) renew() error { // Attempt to renew the token secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) if err != nil { - return err + return fmt.Errorf("failed to renew the vault token: %v", err) } if secret == nil { // It's possible for RenewSelf to return (nil, nil) if the @@ -914,6 +914,7 @@ func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, ta // Determine whether it is unrecoverable if err != nil { + err = fmt.Errorf("failed to create an alloc vault token: %v", err) if structs.VaultUnrecoverableError.MatchString(err.Error()) { return secret, err } From 0fc84f4cfb0a0602ef0bca5b88c795aa490da717 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Thu, 1 Nov 2018 08:40:28 -0400 Subject: [PATCH 05/14] address review comments --- client/vaultclient/vaultclient.go | 2 +- nomad/vault.go | 11 ++++------- nomad/vault_test.go | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/client/vaultclient/vaultclient.go b/client/vaultclient/vaultclient.go index 7494fb2c932..8d9dff28da2 100644 --- a/client/vaultclient/vaultclient.go +++ b/client/vaultclient/vaultclient.go @@ -160,7 +160,7 @@ func NewVaultClient(config *config.VaultConfig, logger hclog.Logger, tokenDerive } client.SetHeaders(http.Header{ - "User-Agent": []string{"HashiCorp/nomad"}, + "User-Agent": []string{"hashicorp/nomad"}, }) c.client = client diff --git a/nomad/vault.go b/nomad/vault.go index 5f21081df57..0033c7f324e 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -515,11 +515,11 @@ func (v *vaultClient) renewalLoop() { // nextBackoff returns the delay for the next auto renew interval, in seconds. // Returns negative value if past expiration // -// It should increaes the amount of backoff each time, with the following rules: +// It should increase the amount of backoff each time, with the following rules: // // * If we have an existing authentication that is going to expire, // never back off more than half of the amount of time remaining -// until expiration (with 1s floor) +// until expiration (with 5s floor) // * Never back off more than 30 seconds multiplied by a random // value between 1 and 2 // * Use randomness so that many clients won't keep hitting Vault @@ -531,8 +531,6 @@ func nextBackoff(backoff float64, expiry time.Time) float64 { } switch { - case backoff < 5: - backoff = 5 case backoff >= 24: backoff = 30 default: @@ -546,9 +544,8 @@ func nextBackoff(backoff float64, expiry time.Time) float64 { backoff = maxBackoff.Seconds() } - // avoid hammering Vault - if backoff < 1 { - backoff = 1 + if backoff < 5 { + backoff = 5 } return backoff diff --git a/nomad/vault_test.go b/nomad/vault_test.go index 191eea9df6b..3de2e079fbd 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -1285,8 +1285,8 @@ func TestVaultClient_nextBackoff(t *testing.T) { // some edge cases t.Run("close to expiry", func(t *testing.T) { b := nextBackoff(20, time.Now().Add(1100*time.Millisecond)) - if !(1.0 <= b && b <= 3) { - t.Fatalf("Expected backoff within [1, 3] but found %v", b) + if b != 5.0 { + t.Fatalf("Expected backoff is 5 but found %v", b) } }) From a0921cc34ffd304d7bf95e86a0211d9fca732756 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 16 Nov 2018 13:32:02 -0500 Subject: [PATCH 06/14] More metrics for Server vault Add a gauge to track remaining time-to-live, duration of renewal request API call. --- nomad/vault.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nomad/vault.go b/nomad/vault.go index 0033c7f324e..482000c887c 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -491,6 +491,7 @@ func (v *vaultClient) renewalLoop() { break } + metrics.IncrCounter([]string{"nomad", "vault", "renew_failed"}, 1) v.logger.Warn("got error or bad auth, so backing off", "error", err) backoff = nextBackoff(backoff, currentExpiration) if backoff < 0 { @@ -554,6 +555,9 @@ func nextBackoff(backoff float64, expiry time.Time) float64 { // renew attempts to renew our Vault token. If the renewal fails, an error is // returned. This method updates the lastRenewed time func (v *vaultClient) renew() error { + // Track how long the request takes + defer metrics.MeasureSince([]string{"nomad", "vault", "renew"}, time.Now()) + // Attempt to renew the token secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) if err != nil { From 887704e7044714341b827c15c000b596fc3971d1 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 2 Nov 2018 16:10:56 -0400 Subject: [PATCH 07/14] Avoid explicit precomputed stats field Seems like the stats field is a micro-optimization that doesn't justify the complexity it introduces. Removing it and computing the stats from revoking field directly. --- nomad/vault.go | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/nomad/vault.go b/nomad/vault.go index 482000c887c..55f58e227a7 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -209,10 +209,6 @@ type vaultClient struct { tomb *tomb.Tomb logger log.Logger - // stats stores the stats - stats *VaultStats - statsLock sync.RWMutex - // l is used to lock the configuration aspects of the client such that // multiple callers can't cause conflicting config updates l sync.Mutex @@ -236,7 +232,6 @@ func NewVaultClient(c *config.VaultConfig, logger log.Logger, purgeFn PurgeVault revoking: make(map[*structs.VaultAccessor]time.Time), purgeFn: purgeFn, tomb: &tomb.Tomb{}, - stats: new(VaultStats), } if v.config.IsEnabled() { @@ -1052,13 +1047,11 @@ func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.Vau // time. func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) { v.revLock.Lock() - v.statsLock.Lock() + now := time.Now() for _, a := range accessors { v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second) } - v.stats.TrackedForRevoke = len(v.revoking) - v.statsLock.Unlock() v.revLock.Unlock() } @@ -1179,12 +1172,9 @@ func (v *vaultClient) revokeDaemon() { // Can delete from the tracked list now that we have purged v.revLock.Lock() - v.statsLock.Lock() for _, va := range revoking { delete(v.revoking, va) } - v.stats.TrackedForRevoke = len(v.revoking) - v.statsLock.Unlock() v.revLock.Unlock() } @@ -1221,11 +1211,9 @@ func (v *vaultClient) Stats() *VaultStats { // Allocate a new stats struct stats := new(VaultStats) - v.statsLock.RLock() - defer v.statsLock.RUnlock() - - // Copy all the stats - stats.TrackedForRevoke = v.stats.TrackedForRevoke + v.revLock.Lock() + stats.TrackedForRevoke = len(v.revoking) + v.revLock.Unlock() return stats } From 0816841fddad183030237575453a332ac1890f9b Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 2 Nov 2018 16:28:51 -0400 Subject: [PATCH 08/14] Populate agent-info with vault Return Vault TTL info to /agent/self API and `nomad agent-info` command. --- nomad/server.go | 1 + nomad/vault.go | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index a3ea18860cb..89da7a9e4b9 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1433,6 +1433,7 @@ func (s *Server) Stats() map[string]map[string]string { "raft": s.raft.Stats(), "serf": s.serf.Stats(), "runtime": stats.RuntimeStats(), + "vault": s.vault.Stats(), } return stats diff --git a/nomad/vault.go b/nomad/vault.go index 55f58e227a7..10b5a6062fd 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -127,7 +127,7 @@ type VaultClient interface { Running() bool // Stats returns the Vault clients statistics - Stats() *VaultStats + Stats() map[string]string // EmitStats emits that clients statistics at the given period until stopCh // is called. @@ -1223,7 +1223,7 @@ func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) { for { select { case <-time.After(period): - stats := v.Stats() + stats := v.stats() metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke)) case <-stopCh: return From 612825dd5c350314b9bdfa7589006dd1d127596b Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 9 Nov 2018 11:19:59 -0500 Subject: [PATCH 09/14] reconcile interface --- nomad/vault_testing.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/vault_testing.go b/nomad/vault_testing.go index d5a361c9e5c..9ac8f30f42e 100644 --- a/nomad/vault_testing.go +++ b/nomad/vault_testing.go @@ -139,5 +139,5 @@ func (v *TestVaultClient) Stop() func (v *TestVaultClient) SetActive(enabled bool) {} func (v *TestVaultClient) SetConfig(config *config.VaultConfig) error { return nil } func (v *TestVaultClient) Running() bool { return true } -func (v *TestVaultClient) Stats() *VaultStats { return new(VaultStats) } +func (v *TestVaultClient) Stats() map[string]string { return map[string]string{} } func (v *TestVaultClient) EmitStats(period time.Duration, stopCh chan struct{}) {} From df0cddda0600586ce4e024c5d07af832e5f471b7 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 9 Nov 2018 11:34:09 -0500 Subject: [PATCH 10/14] Track renewal expiration properly --- nomad/vault.go | 56 +++++++++++++++++++++++++++++++++------------ nomad/vault_test.go | 47 ++++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 16 deletions(-) diff --git a/nomad/vault.go b/nomad/vault.go index 10b5a6062fd..ae0ba953ae7 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "math/rand" + "strconv" "sync" "sync/atomic" "time" @@ -140,6 +141,12 @@ type VaultStats struct { // TrackedForRevoke is the count of tokens that are being tracked to be // revoked since they could not be immediately revoked. TrackedForRevoke int + + // TokenTTL is the time-to-live duration for the current token + TokenTTL time.Duration + + // TokenExpiry Time is the recoreded expiry time of the current token + TokenExpiry time.Time } // PurgeVaultAccessor is called to remove VaultAccessors from the system. If @@ -203,8 +210,8 @@ type vaultClient struct { // childTTL is the TTL for child tokens. childTTL string - // lastRenewed is the time the token was last renewed - lastRenewed time.Time + // currentExpiration is the time the current tokean lease expires + currentExpiration time.Time tomb *tomb.Tomb logger log.Logger @@ -469,13 +476,11 @@ func (v *vaultClient) renewalLoop() { case <-authRenewTimer.C: // Renew the token and determine the new expiration err := v.renew() - currentExpiration := v.lastRenewed.Add(time.Duration(v.tokenData.CreationTTL) * time.Second) + currentExpiration := v.currentExpiration // Successfully renewed if err == nil { - // If we take the expiration (lastRenewed + auth duration) and - // subtract the current time, we get a duration until expiry. - // Set the timer to poke us after half of that time is up. + // Attempt to renew the token at half the expiration time durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 v.logger.Info("successfully renewed token", "next_renewal", durationUntilRenew) @@ -548,7 +553,7 @@ func nextBackoff(backoff float64, expiry time.Time) float64 { } // renew attempts to renew our Vault token. If the renewal fails, an error is -// returned. This method updates the lastRenewed time +// returned. This method updates the currentExpiration time func (v *vaultClient) renew() error { // Track how long the request takes defer metrics.MeasureSince([]string{"nomad", "vault", "renew"}, time.Now()) @@ -571,7 +576,8 @@ func (v *vaultClient) renew() error { return fmt.Errorf("renewal successful but no lease duration returned") } - v.lastRenewed = time.Now() + v.currentExpiration = time.Now().Add(time.Duration(auth.LeaseDuration) * time.Second) + v.logger.Debug("successfully renewed server token") return nil } @@ -618,7 +624,6 @@ func (v *vaultClient) parseSelfToken() error { if err := mapstructure.WeakDecode(self.Data, &data); err != nil { return fmt.Errorf("failed to parse Vault token's data block: %v", err) } - root := false for _, p := range data.Policies { if p == "root" { @@ -626,10 +631,9 @@ func (v *vaultClient) parseSelfToken() error { break } } - - // Store the token data data.Root = root v.tokenData = &data + v.currentExpiration = time.Now().Add(time.Duration(data.TTL) * time.Second) // The criteria that must be met for the token to be valid are as follows: // 1) If token is non-root or is but has a creation ttl @@ -648,7 +652,7 @@ func (v *vaultClient) parseSelfToken() error { var mErr multierror.Error role := v.getRole() - if !root { + if !data.Root { // All non-root tokens must be renewable if !data.Renewable { multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or root")) @@ -680,7 +684,7 @@ func (v *vaultClient) parseSelfToken() error { } // Check we have the correct capabilities - if err := v.validateCapabilities(role, root); err != nil { + if err := v.validateCapabilities(role, data.Root); err != nil { multierror.Append(&mErr, err) } @@ -1206,8 +1210,23 @@ func (v *vaultClient) setLimit(l rate.Limit) { v.limiter = rate.NewLimiter(l, int(l)) } -// Stats is used to query the state of the blocked eval tracker. -func (v *vaultClient) Stats() *VaultStats { +func (v *vaultClient) Stats() map[string]string { + stat := v.stats() + + expireTimeStr := "" + + if !stat.TokenExpiry.IsZero() { + expireTimeStr = stat.TokenExpiry.Format(time.RFC3339) + } + + return map[string]string{ + "tracked_for_revoked": strconv.Itoa(stat.TrackedForRevoke), + "token_ttl": stat.TokenTTL.String(), + "token_expire_time": expireTimeStr, + } +} + +func (v *vaultClient) stats() *VaultStats { // Allocate a new stats struct stats := new(VaultStats) @@ -1215,6 +1234,11 @@ func (v *vaultClient) Stats() *VaultStats { stats.TrackedForRevoke = len(v.revoking) v.revLock.Unlock() + stats.TokenExpiry = v.currentExpiration + if !stats.TokenExpiry.IsZero() { + stats.TokenTTL = time.Until(stats.TokenExpiry) + } + return stats } @@ -1225,6 +1249,8 @@ func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) { case <-time.After(period): stats := v.stats() metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke)) + metrics.SetGauge([]string{"nomad", "vault", "token_ttl"}, float32(stats.TokenTTL/time.Millisecond)) + case <-stopCh: return } diff --git a/nomad/vault_test.go b/nomad/vault_test.go index 3de2e079fbd..67dcc0b1d31 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -10,6 +10,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" "github.com/hashicorp/nomad/helper" @@ -528,6 +530,49 @@ func TestVaultClient_RenewalLoop(t *testing.T) { if ttl == 0 { t.Fatalf("token renewal failed; ttl %v", ttl) } + + if client.currentExpiration.Before(time.Now()) { + t.Fatalf("found current expiration to be in past %s", time.Until(client.currentExpiration)) + } +} + +func TestVaultClientRenewUpdatesExpiration(t *testing.T) { + t.Parallel() + v := testutil.NewTestVault(t) + defer v.Stop() + + // Set the configs token in a new test role + v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) + + // Start the client + logger := testlog.HCLogger(t) + client, err := NewVaultClient(v.Config, logger, nil) + if err != nil { + t.Fatalf("failed to build vault client: %v", err) + } + defer client.Stop() + + // Get the current TTL + a := v.Client.Auth().Token() + s2, err := a.Lookup(v.Config.Token) + if err != nil { + t.Fatalf("failed to lookup token: %v", err) + } + exp0 := time.Now().Add(time.Duration(parseTTLFromLookup(s2, t)) * time.Second) + + time.Sleep(1 * time.Second) + + err = client.renew() + require.NoError(t, err) + exp1 := client.currentExpiration + require.True(t, exp0.Before(exp1)) + + time.Sleep(1 * time.Second) + + err = client.renew() + require.NoError(t, err) + exp2 := client.currentExpiration + require.True(t, exp1.Before(exp2)) } func parseTTLFromLookup(s *vapi.Secret, t *testing.T) int64 { @@ -1114,7 +1159,7 @@ func TestVaultClient_RevokeTokens_PreEstablishs(t *testing.T) { t.Fatalf("didn't add to revoke loop") } - if client.Stats().TrackedForRevoke != 2 { + if client.stats().TrackedForRevoke != 2 { t.Fatalf("didn't add to revoke loop") } } From 91500c0823451ef5f6116445cf74e5fb5c79557f Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 9 Nov 2018 13:51:19 -0500 Subject: [PATCH 11/14] round ttl duration for users --- nomad/vault.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/vault.go b/nomad/vault.go index ae0ba953ae7..11c6ba02ab7 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -1221,7 +1221,7 @@ func (v *vaultClient) Stats() map[string]string { return map[string]string{ "tracked_for_revoked": strconv.Itoa(stat.TrackedForRevoke), - "token_ttl": stat.TokenTTL.String(), + "token_ttl": stat.TokenTTL.Round(time.Second).String(), "token_expire_time": expireTimeStr, } } From 6445745c740ea66303ea2cc86b972aed387daeb2 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 20 Nov 2018 08:54:21 -0500 Subject: [PATCH 12/14] fix typo --- nomad/vault.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/vault.go b/nomad/vault.go index 11c6ba02ab7..031780318cf 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -210,7 +210,7 @@ type vaultClient struct { // childTTL is the TTL for child tokens. childTTL string - // currentExpiration is the time the current tokean lease expires + // currentExpiration is the time the current token lease expires currentExpiration time.Time tomb *tomb.Tomb From 118300ae33ca18bf4191beaece8f3fe8e1eebdb1 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 20 Nov 2018 15:14:57 -0500 Subject: [PATCH 13/14] Renew past recorded expiry till unrecoverable error Keep attempting to renew Vault token past locally recorded expiry, just in case the token was renewed out of band, e.g. on another Nomad server, until Vault returns an unrecoverable error. --- nomad/vault.go | 44 ++++++++++++++++++++----- nomad/vault_test.go | 80 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/nomad/vault.go b/nomad/vault.go index 031780318cf..3c2bc32eebf 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -207,6 +207,11 @@ type vaultClient struct { // running indicates whether the vault client is started. running bool + // renewLoopActive indicates whether the renewal goroutine is running + // It should be accessed and updated atomically + // used for testing purposes only + renewLoopActive int32 + // childTTL is the TTL for child tokens. childTTL string @@ -458,9 +463,16 @@ OUTER: v.l.Unlock() } +func (v *vaultClient) isRenewLoopActive() bool { + return atomic.LoadInt32(&v.renewLoopActive) == 1 +} + // renewalLoop runs the renew loop. This should only be called if we are given a // non-root token. func (v *vaultClient) renewalLoop() { + atomic.StoreInt32(&v.renewLoopActive, 1) + defer atomic.StoreInt32(&v.renewLoopActive, 0) + // Create the renewal timer and set initial duration to zero so it fires // immediately authRenewTimer := time.NewTimer(0) @@ -475,7 +487,7 @@ func (v *vaultClient) renewalLoop() { return case <-authRenewTimer.C: // Renew the token and determine the new expiration - err := v.renew() + recoverable, err := v.renew() currentExpiration := v.currentExpiration // Successfully renewed @@ -492,7 +504,12 @@ func (v *vaultClient) renewalLoop() { } metrics.IncrCounter([]string{"nomad", "vault", "renew_failed"}, 1) - v.logger.Warn("got error or bad auth, so backing off", "error", err) + v.logger.Warn("got error or bad auth, so backing off", "error", err, "recoverable", recoverable) + + if !recoverable { + return + } + backoff = nextBackoff(backoff, currentExpiration) if backoff < 0 { // We have failed to renew the token past its expiration. Stop @@ -518,6 +535,8 @@ func (v *vaultClient) renewalLoop() { // // It should increase the amount of backoff each time, with the following rules: // +// * If token expired already despite earlier renewal attempts, +// back off for 1 minute + jitter // * If we have an existing authentication that is going to expire, // never back off more than half of the amount of time remaining // until expiration (with 5s floor) @@ -527,8 +546,10 @@ func (v *vaultClient) renewalLoop() { // at the same time func nextBackoff(backoff float64, expiry time.Time) float64 { maxBackoff := time.Until(expiry) / 2 + if maxBackoff < 0 { - return -1 + // expiry passed + return 60 * (1.0 + rand.Float64()) } switch { @@ -553,15 +574,19 @@ func nextBackoff(backoff float64, expiry time.Time) float64 { } // renew attempts to renew our Vault token. If the renewal fails, an error is -// returned. This method updates the currentExpiration time -func (v *vaultClient) renew() error { +// returned. The boolean indicates whether it's safe to attempt to renew again. +// This method updates the currentExpiration time +func (v *vaultClient) renew() (bool, error) { // Track how long the request takes defer metrics.MeasureSince([]string{"nomad", "vault", "renew"}, time.Now()) // Attempt to renew the token secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) if err != nil { - return fmt.Errorf("failed to renew the vault token: %v", err) + + // Check if there is a permission denied + recoverable := !structs.VaultUnrecoverableError.MatchString(err.Error()) + return recoverable, fmt.Errorf("failed to renew the vault token: %v", err) } if secret == nil { // It's possible for RenewSelf to return (nil, nil) if the @@ -569,17 +594,18 @@ func (v *vaultClient) renew() error { return fmt.Errorf("renewal failed: empty response from vault") } + // these treated as transient errors, where can keep renewing auth := secret.Auth if auth == nil { - return fmt.Errorf("renewal successful but not auth information returned") + return true, fmt.Errorf("renewal successful but not auth information returned") } else if auth.LeaseDuration == 0 { - return fmt.Errorf("renewal successful but no lease duration returned") + return true, fmt.Errorf("renewal successful but no lease duration returned") } v.currentExpiration = time.Now().Add(time.Duration(auth.LeaseDuration) * time.Second) v.logger.Debug("successfully renewed server token") - return nil + return true, nil } // getWrappingFn returns an appropriate wrapping function for Nomad Servers diff --git a/nomad/vault_test.go b/nomad/vault_test.go index 67dcc0b1d31..089efca3e8d 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -3,6 +3,7 @@ package nomad import ( "context" "encoding/json" + "errors" "fmt" "math/rand" "reflect" @@ -10,6 +11,7 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/time/rate" @@ -562,19 +564,89 @@ func TestVaultClientRenewUpdatesExpiration(t *testing.T) { time.Sleep(1 * time.Second) - err = client.renew() + _, err = client.renew() require.NoError(t, err) exp1 := client.currentExpiration require.True(t, exp0.Before(exp1)) time.Sleep(1 * time.Second) - err = client.renew() + _, err = client.renew() require.NoError(t, err) exp2 := client.currentExpiration require.True(t, exp1.Before(exp2)) } +func TestVaultClient_StopsAfterPermissionError(t *testing.T) { + t.Parallel() + v := testutil.NewTestVault(t) + defer v.Stop() + + // Set the configs token in a new test role + v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 2) + + // Start the client + logger := testlog.HCLogger(t) + client, err := NewVaultClient(v.Config, logger, nil) + if err != nil { + t.Fatalf("failed to build vault client: %v", err) + } + defer client.Stop() + + time.Sleep(500 * time.Millisecond) + + assert.True(t, client.isRenewLoopActive()) + + // Get the current TTL + a := v.Client.Auth().Token() + assert.NoError(t, a.RevokeSelf("")) + + testutil.WaitForResult(func() (bool, error) { + if !client.isRenewLoopActive() { + return true, nil + } else { + return false, errors.New("renew loop should terminate after token is revoked") + } + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} +func TestVaultClient_LoopsUntilCannotRenew(t *testing.T) { + t.Parallel() + v := testutil.NewTestVault(t) + defer v.Stop() + + // Set the configs token in a new test role + v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) + + // Start the client + logger := testlog.HCLogger(t) + client, err := NewVaultClient(v.Config, logger, nil) + if err != nil { + t.Fatalf("failed to build vault client: %v", err) + } + defer client.Stop() + + // Sleep 8 seconds and ensure we have a non-zero TTL + time.Sleep(8 * time.Second) + + // Get the current TTL + a := v.Client.Auth().Token() + s2, err := a.Lookup(v.Config.Token) + if err != nil { + t.Fatalf("failed to lookup token: %v", err) + } + + ttl := parseTTLFromLookup(s2, t) + if ttl == 0 { + t.Fatalf("token renewal failed; ttl %v", ttl) + } + + if client.currentExpiration.Before(time.Now()) { + t.Fatalf("found current expiration to be in past %s", time.Until(client.currentExpiration)) + } +} + func parseTTLFromLookup(s *vapi.Secret, t *testing.T) int64 { if s == nil { t.Fatalf("nil secret") @@ -1337,8 +1409,8 @@ func TestVaultClient_nextBackoff(t *testing.T) { t.Run("past expiry", func(t *testing.T) { b := nextBackoff(20, time.Now().Add(-1100*time.Millisecond)) - if b >= 0.0 { - t.Fatalf("Expected backoff to be negative but found %v", b) + if !(60 <= b && b <= 120) { + t.Fatalf("Expected backoff within [%v, %v] but found %v", 60, 120, b) } }) } From 3a57b9c2fee2aced5f263ea7e6844f444657b0c2 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 20 Nov 2018 17:11:55 -0500 Subject: [PATCH 14/14] nil secrets as recoverable to keep renew attempts --- nomad/vault.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/vault.go b/nomad/vault.go index 3c2bc32eebf..cedfd056402 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -583,15 +583,15 @@ func (v *vaultClient) renew() (bool, error) { // Attempt to renew the token secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) if err != nil { - // Check if there is a permission denied recoverable := !structs.VaultUnrecoverableError.MatchString(err.Error()) return recoverable, fmt.Errorf("failed to renew the vault token: %v", err) } + if secret == nil { // It's possible for RenewSelf to return (nil, nil) if the // response body from Vault is empty. - return fmt.Errorf("renewal failed: empty response from vault") + return true, fmt.Errorf("renewal failed: empty response from vault") } // these treated as transient errors, where can keep renewing