From e9927851bac8fba691742c7927789bb9c016d2f5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 5 Aug 2020 15:26:45 -0700 Subject: [PATCH] vault: log once per interval if batching revocation This log line should be rare since: 1. Most tokens should be revoked synchronously, not via this async batched method. Async revocation only takes place when Vault connectivity is lost and after leader election so no revocations are missed. 2. There should rarely be >1 batch (1,000 tokens) to revoke since the above conditions should be brief and infrequent. 3. Interval is 5 minutes, so this log line will be emitted at *most* once every 5 minutes. What makes this log line rare is also what makes it interesting: due to a bug prior to Nomad 0.11.2 some tokens may never get revoked. Therefore Nomad tries to re-revoke them on every leader election. This caused a massive buildup of old tokens that would never be properly revoked and purged. Nomad 0.11.3 mostly fixed this but still had a bug in purging revoked tokens via Raft (fixed in #8553). The nomad.vault.distributed_tokens_revoked metric is only ticked upon successful revocation and purging, making any bugs or slowness in the process difficult to detect. Logging before a potentially slow revocation+purge operation is performed will give users much better indications of what activity is going on should the process fail to make it to the metric. 
--- nomad/vault.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nomad/vault.go b/nomad/vault.go index 19b73035129..3043d896edc 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -1303,6 +1303,9 @@ func (v *vaultClient) revokeDaemon() { // Build the list of accessors that need to be revoked while pruning any TTL'd checks toRevoke := len(v.revoking) if toRevoke > v.maxRevokeBatchSize { + v.logger.Info("batching tokens to be revoked", + "to_revoke", toRevoke, "batch_size", v.maxRevokeBatchSize, + "batch_interval", v.revocationIntv) toRevoke = v.maxRevokeBatchSize } revoking := make([]*structs.VaultAccessor, 0, toRevoke)