From d85a08499038da60b5923ec2d3dbff6836b33df5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 26 Sep 2022 10:47:08 -0700 Subject: [PATCH] fingerprint: lengthen Vault check after seen Extension of #14673 Once Vault is initially fingerprinted, extend the period since changes should be infrequent and the fingerprint is relatively expensive since it is contacting a central Vault server. Also move the period timer reset *after* the fingerprint. This is similar to #9435 where the idea is to ensure the retry period starts *after* the operation is attempted. 15s will be the *minimum* time between fingerprints now instead of the *maximum* time between fingerprints. In the case of Vault fingerprinting, the original behavior might cause the following: 1. Timer is reset to 15s 2. Fingerprint takes 16s 3. Timer has already elapsed so we immediately Fingerprint again Even if fingerprinting Vault only takes a few seconds, that may very well be due to excessive load and backing off our fingerprints is desirable. The new bevahior ensures we always wait at least 15s between fingerprint attempts and should allow some natural jittering based on server load and network latency. --- client/fingerprint/vault.go | 7 +++++ client/fingerprint/vault_test.go | 27 +++++++++++++++++++ client/fingerprint_manager.go | 13 ++++----- .../content/docs/upgrade/upgrade-specific.mdx | 2 +- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/client/fingerprint/vault.go b/client/fingerprint/vault.go index 5639e861c48..c887ddcef1e 100644 --- a/client/fingerprint/vault.go +++ b/client/fingerprint/vault.go @@ -7,6 +7,7 @@ import ( "time" log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" vapi "github.com/hashicorp/vault/api" ) @@ -78,5 +79,11 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin } func (f *VaultFingerprint) Periodic() (bool, time.Duration) { + if f.lastState == vaultAvailable { + // Fingerprint infrequently once Vault is initially discovered with wide + // jitter to avoid thundering herds of fingerprints against central Vault + // servers. + return true, (30 * time.Second) + helper.RandomStagger(90*time.Second) + } return true, 15 * time.Second } diff --git a/client/fingerprint/vault_test.go b/client/fingerprint/vault_test.go index c56ff91381a..32cd8f3b412 100644 --- a/client/fingerprint/vault_test.go +++ b/client/fingerprint/vault_test.go @@ -2,6 +2,7 @@ package fingerprint import ( "testing" + "time" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" @@ -21,6 +22,14 @@ func TestVaultFingerprint(t *testing.T) { Attributes: make(map[string]string), } + p, period := fp.Periodic() + if !p { + t.Fatalf("expected fingerprint to be periodic") + } + if period != (15 * time.Second) { + t.Fatalf("expected period to be 15s but found: %s", period) + } + conf := config.DefaultConfig() conf.VaultConfig = tv.Config @@ -40,6 +49,16 @@ func TestVaultFingerprint(t *testing.T) { assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id") assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name") + // Period should be longer after initial discovery + p, period = fp.Periodic() + if !p { + t.Fatalf("expected fingerprint to be periodic") + } + if period < (30*time.Second) || period > (2*time.Minute) { + t.Fatalf("expected period to be between 30s and 2m but found: %s", period) + } + + // Stop Vault to simulate it being unavailable tv.Stop() err = fp.Fingerprint(request, &response) @@ -56,4 +75,12 @@ func TestVaultFingerprint(t *testing.T) { assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id") assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name") + // Period should be original once trying to discover Vault is available again + p, period = fp.Periodic() + if !p { + t.Fatalf("expected fingerprint to be periodic") + } + if period != (15 * time.Second) { + t.Fatalf("expected period to be 15s but found: %s", period) + } } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 8a08a077a2c..1886c8df551 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -142,9 +142,9 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { appliedFingerprints = append(appliedFingerprints, name) } - p, period := f.Periodic() + p, _ := f.Periodic() if p { - go fm.runFingerprint(f, period, name) + go fm.runFingerprint(f, name) } if rfp, ok := f.(fingerprint.ReloadableFingerprint); ok { @@ -157,8 +157,9 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { } // runFingerprint runs each fingerprinter individually on an ongoing basis -func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) { - fm.logger.Debug("fingerprinting periodically", "fingerprinter", name, "period", period) +func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, name string) { + _, period := f.Periodic() + fm.logger.Debug("fingerprinting periodically", "fingerprinter", name, "initial_period", period) timer := time.NewTimer(period) defer timer.Stop() @@ -166,14 +167,14 @@ func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period t for { select { case <-timer.C: - timer.Reset(period) - _, err := fm.fingerprint(name, f) if err != nil { fm.logger.Debug("error periodic fingerprinting", "error", err, "fingerprinter", name) continue } + _, period = f.Periodic() + timer.Reset(period) case <-fm.shutdownCh: return } diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index b9eb22281c3..9a98d2d20fb 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -62,7 +62,7 @@ the `-json` flag is provided. Nomad clients no longer have their Consul and Vault fingerprints cleared when connectivity is lost with Consul and Vault. To intentionally remove Consul and -Vault from a client node, you will need to restart the client. +Vault from a client node, you will need to restart the Nomad client agent. ## Nomad 1.3.3