From e3ab3629f2388fa50f1bbccb207e5089b52bd191 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 19 Jul 2024 17:29:41 +0000 Subject: [PATCH] backport of commit 2f4353412d1bcf305898f676a815d342304ffa28 --- .changelog/23577.txt | 15 ++ api/keyring.go | 18 +- command/agent/keyring_endpoint.go | 10 + command/agent/keyring_endpoint_test.go | 82 ++++-- command/operator_root_keyring.go | 10 +- command/operator_root_keyring_rotate.go | 45 +++- nomad/core_sched.go | 139 ++++++++-- nomad/core_sched_test.go | 240 +++++++++++++----- nomad/encrypter.go | 10 +- nomad/encrypter_test.go | 2 +- nomad/keyring_endpoint.go | 11 +- nomad/keyring_endpoint_test.go | 80 ++++-- nomad/leader.go | 2 +- nomad/state/state_store.go | 12 +- nomad/state/state_store_test.go | 11 +- nomad/structs/keyring.go | 102 ++++++-- .../commands/operator/root/keyring-rotate.mdx | 31 ++- website/content/docs/configuration/server.mdx | 16 +- .../content/docs/integrations/consul/acl.mdx | 8 +- .../content/docs/integrations/vault/acl.mdx | 8 +- .../content/docs/upgrade/upgrade-specific.mdx | 14 +- 21 files changed, 652 insertions(+), 214 deletions(-) create mode 100644 .changelog/23577.txt diff --git a/.changelog/23577.txt b/.changelog/23577.txt new file mode 100644 index 00000000000..353aa2b9ceb --- /dev/null +++ b/.changelog/23577.txt @@ -0,0 +1,15 @@ +```release-note:improvement +keyring: Added support for prepublishing keys +``` + +```release-note:bug +keyring: Fixed a bug where periodic key rotation would not occur +``` + +```release-note:bug +keyring: Fixed a bug where keys could be garbage collected before workload identities expire +``` + +```release-note:bug +keyring: Fixed a bug where keys would never exit the "rekeying" state after a rotation with the `-full` flag +``` diff --git a/api/keyring.go b/api/keyring.go index d87d8b720fc..69e0c9c1073 100644 --- a/api/keyring.go +++ b/api/keyring.go @@ -34,16 +34,18 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + 
PublishTime int64 } // RootKeyState enum describes the lifecycle of a root key. type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" - RootKeyStateDeprecated = "deprecated" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStateDeprecated = "deprecated" + RootKeyStatePrepublished = "prepublished" ) // List lists all the keyring metadata @@ -78,6 +80,9 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM if opts.Full { qp.Set("full", "true") } + if opts.PublishTime > 0 { + qp.Set("publish_time", fmt.Sprintf("%d", opts.PublishTime)) + } } resp := &struct{ Key *RootKeyMeta }{} wm, err := k.client.put("/v1/operator/keyring/rotate?"+qp.Encode(), nil, resp, w) @@ -86,6 +91,7 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM // KeyringRotateOptions are parameters for the Rotate API type KeyringRotateOptions struct { - Full bool - Algorithm EncryptionAlgorithm + Full bool + Algorithm EncryptionAlgorithm + PublishTime int64 } diff --git a/command/agent/keyring_endpoint.go b/command/agent/keyring_endpoint.go index b753507323c..870aae3f06c 100644 --- a/command/agent/keyring_endpoint.go +++ b/command/agent/keyring_endpoint.go @@ -6,6 +6,7 @@ package agent import ( "fmt" "net/http" + "strconv" "strings" "time" @@ -167,6 +168,15 @@ func (s *HTTPServer) keyringRotateRequest(resp http.ResponseWriter, req *http.Re args.Full = true } + ptRaw := query.Get("publish_time") + if ptRaw != "" { + publishTime, err := strconv.ParseInt(ptRaw, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid publish_time: %w", err) + } + args.PublishTime = publishTime + } + var out structs.KeyringRotateRootKeyResponse if err := s.agent.RPC("Keyring.Rotate", &args, &out); err != nil { return nil, err diff --git a/command/agent/keyring_endpoint_test.go 
b/command/agent/keyring_endpoint_test.go index 938642795af..428da9d5078 100644 --- a/command/agent/keyring_endpoint_test.go +++ b/command/agent/keyring_endpoint_test.go @@ -4,6 +4,7 @@ package agent import ( + "fmt" "net/http" "net/http/httptest" "strconv" @@ -13,7 +14,6 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/shoenig/test/must" - "github.com/stretchr/testify/require" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/nomad/structs" @@ -29,57 +29,83 @@ func TestHTTP_Keyring_CRUD(t *testing.T) { // List (get bootstrap key) req, err := http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err := s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp := obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - oldKeyID := listResp[0].KeyID + must.Len(t, 1, listResp) + key0 := listResp[0].KeyID // Rotate req, err = http.NewRequest(http.MethodPut, "/v1/operator/keyring/rotate", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) - require.NotZero(t, respW.HeaderMap.Get("X-Nomad-Index")) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) rotateResp := obj.(structs.KeyringRotateRootKeyResponse) - require.NotNil(t, rotateResp.Key) - require.True(t, rotateResp.Key.Active()) - newID1 := rotateResp.Key.KeyID + must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.IsActive()) + key1 := rotateResp.Key.KeyID + + // Rotate with prepublish + + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + req, err = http.NewRequest(http.MethodPut, + fmt.Sprintf("/v1/operator/keyring/rotate?publish_time=%d", publishTime), nil) + must.NoError(t, err) + obj, err = s.Server.KeyringRequest(respW, req) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) + rotateResp = obj.(structs.KeyringRotateRootKeyResponse) + 
must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.IsPrepublished()) + key2 := rotateResp.Key.KeyID // List req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 2) + must.Len(t, 3, listResp) for _, key := range listResp { - if key.KeyID == newID1 { - require.True(t, key.Active(), "new key should be active") - } else { - require.False(t, key.Active(), "initial key should be inactive") + switch key.KeyID { + case key0: + must.True(t, key.IsInactive(), must.Sprint("initial key should be inactive")) + case key1: + must.True(t, key.IsActive(), must.Sprint("new key should be active")) + case key2: + must.True(t, key.IsPrepublished(), + must.Sprint("prepublished key should not be active")) } } - // Delete the old key and verify its gone + // Delete the original key and verify its gone - req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+oldKeyID, nil) - require.NoError(t, err) + req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+key0, nil) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - require.Equal(t, newID1, listResp[0].KeyID) - require.True(t, listResp[0].Active()) - require.Len(t, listResp, 1) + must.Len(t, 2, listResp) + for _, key := range listResp { + switch key.KeyID { + case key0: + t.Fatalf("initial key should have been deleted") + case key1: + must.True(t, key.IsActive(), must.Sprint("new key should be active")) + 
case key2: + must.True(t, key.IsPrepublished(), + must.Sprint("prepublished key should not be active")) + } + } }) } diff --git a/command/operator_root_keyring.go b/command/operator_root_keyring.go index 63a3a14e8dc..b92e1a57733 100644 --- a/command/operator_root_keyring.go +++ b/command/operator_root_keyring.go @@ -75,11 +75,15 @@ func renderVariablesKeysResponse(keys []*api.RootKeyMeta, verbose bool) string { length = 8 } out := make([]string, len(keys)+1) - out[0] = "Key|State|Create Time" + out[0] = "Key|State|Create Time|Publish Time" i := 1 for _, k := range keys { - out[i] = fmt.Sprintf("%s|%v|%s", - k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime)) + publishTime := "" + if k.PublishTime > 0 { + publishTime = formatUnixNanoTime(k.PublishTime) + } + out[i] = fmt.Sprintf("%s|%v|%s|%s", + k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime), publishTime) i = i + 1 } return formatList(out) diff --git a/command/operator_root_keyring_rotate.go b/command/operator_root_keyring_rotate.go index 9c274c77292..70271666cee 100644 --- a/command/operator_root_keyring_rotate.go +++ b/command/operator_root_keyring_rotate.go @@ -6,6 +6,7 @@ package command import ( "fmt" "strings" + "time" "github.com/hashicorp/nomad/api" "github.com/posener/complete" @@ -36,6 +37,17 @@ Keyring Options: will immediately return and the re-encryption process will run asynchronously on the leader. + -now + Publish the new key immediately without prepublishing. One of -now or + -prepublish must be set. + + -prepublish + Set a duration for which to prepublish the new key (ex. "1h"). The currently + active key will be unchanged but the new public key will be available in the + JWKS endpoint. Multiple keys can be prepublished and they will be promoted to + active in order of publish time, at most once every root_key_gc_interval. One + of -now or -prepublish must be set. + -verbose Show full information. 
` @@ -50,8 +62,10 @@ func (c *OperatorRootKeyringRotateCommand) Synopsis() string { func (c *OperatorRootKeyringRotateCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-full": complete.PredictNothing, - "-verbose": complete.PredictNothing, + "-full": complete.PredictNothing, + "-now": complete.PredictNothing, + "-prepublish": complete.PredictNothing, + "-verbose": complete.PredictNothing, }) } @@ -64,12 +78,15 @@ func (c *OperatorRootKeyringRotateCommand) Name() string { } func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { - var rotateFull, verbose bool + var rotateFull, rotateNow, verbose bool + var prepublishDuration time.Duration flags := c.Meta.FlagSet("root keyring rotate", FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&rotateFull, "full", false, "full key rotation") + flags.BoolVar(&rotateNow, "now", false, "immediately rotate without prepublish") flags.BoolVar(&verbose, "verbose", false, "") + flags.DurationVar(&prepublishDuration, "prepublish", 0, "prepublish key") if err := flags.Parse(args); err != nil { return 1 @@ -88,8 +105,28 @@ func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { return 1 } + if !rotateNow && prepublishDuration == 0 || rotateNow && prepublishDuration != 0 { + c.Ui.Error(` +One of "-now" or "-prepublish" must be used. + +If a key has been leaked use "-now" to force immediate rotation. + +Otherwise please use "-prepublish <duration>" to ensure the new key is not used +to sign workload identities before JWKS endpoints are updated. 
+`) + return 1 + } + + publishTime := int64(0) + if prepublishDuration > 0 { + publishTime = time.Now().UnixNano() + prepublishDuration.Nanoseconds() + } + resp, _, err := client.Keyring().Rotate( - &api.KeyringRotateOptions{Full: rotateFull}, nil) + &api.KeyringRotateOptions{ + Full: rotateFull, + PublishTime: publishTime, + }, nil) if err != nil { c.Ui.Error(fmt.Sprintf("error: %s", err)) return 1 diff --git a/nomad/core_sched.go b/nomad/core_sched.go index ebbeecb234c..89afb6c3489 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -99,7 +99,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { if err := c.expiredACLTokenGC(eval, true); err != nil { return err } - if err := c.rootKeyGC(eval); err != nil { + if err := c.rootKeyGC(eval, time.Now()); err != nil { return err } // Node GC must occur after the others to ensure the allocations are @@ -902,20 +902,17 @@ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error { // a rotation will be sent to the leader so our view of state // is no longer valid. 
we ack this core job and will pick up // the GC work on the next interval - wasRotated, err := c.rootKeyRotate(eval) + wasRotated, err := c.rootKeyRotate(eval, time.Now()) if err != nil { return err } if wasRotated { return nil } - return c.rootKeyGC(eval) + return c.rootKeyGC(eval, time.Now()) } -func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { - - oldThreshold := c.getThreshold(eval, "root key", - "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold) +func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error { ws := memdb.NewWatchSet() iter, err := c.snap.RootKeyMetas(ws) @@ -923,19 +920,31 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return err } + // the threshold is longer than we can support with the time table, and we + // never want to force-GC keys because that will orphan signed Workload + // Identities + rotationThreshold := now.Add(-1 * + (c.srv.config.RootKeyRotationThreshold + c.srv.config.RootKeyGCThreshold)) + for { raw := iter.Next() if raw == nil { break } keyMeta := raw.(*structs.RootKeyMeta) - if keyMeta.Active() || keyMeta.Rekeying() { - continue // never GC the active key or one we're rekeying + if !keyMeta.IsInactive() { + continue // never GC keys we're still using } - if keyMeta.CreateIndex > oldThreshold { - continue // don't GC recent keys + + c.logger.Trace("checking inactive key eligibility for gc", + "create_time", keyMeta.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if keyMeta.CreateTime > rotationThreshold.UnixNano() { + continue // don't GC keys with potentially live Workload Identities } + // don't GC keys used to encrypt Variables or sign legacy non-expiring + // Workload Identities inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID) if err != nil { return err @@ -961,26 +970,97 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return nil } -// rootKeyRotate checks if the active key is old enough that we need -// to kick 
off a rotation. -func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) { - - rotationThreshold := c.getThreshold(eval, "root key", - "root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold) +// rootKeyRotate checks if the active key is old enough that we need to kick off +// a rotation. It prepublishes a key first and only promotes that prepublished +// key to active once the rotation threshold has expired +func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) { ws := memdb.NewWatchSet() - activeKey, err := c.snap.GetActiveRootKeyMeta(ws) + iter, err := c.snap.RootKeyMetas(ws) if err != nil { return false, err } + + var ( + activeKey *structs.RootKeyMeta + prepublishedKey *structs.RootKeyMeta + ) + + for raw := iter.Next(); raw != nil; raw = iter.Next() { + key := raw.(*structs.RootKeyMeta) + switch key.State { + case structs.RootKeyStateActive: + activeKey = key + case structs.RootKeyStatePrepublished: + // multiple keys can be prepublished, so we only want to handle the + // very next one + if prepublishedKey == nil { + prepublishedKey = key + } else if prepublishedKey.PublishTime > key.PublishTime { + prepublishedKey = key + } + } + } + + if prepublishedKey != nil { + c.logger.Trace("checking prepublished key eligibility for promotion", + "publish_time", prepublishedKey.PublishTime, "now", now.UnixNano()) + + if prepublishedKey.PublishTime > now.UnixNano() { + // at this point we have a key in a prepublished state but it's not + // ready to be made active, so we bail out. 
otherwise we'd kick off + // a new rotation every time we process this eval and we're past + // internval/2 + return false, nil + } + + rootKey, err := c.srv.encrypter.GetKey(prepublishedKey.KeyID) + if err != nil { + c.logger.Error("prepublished key does not exist in keyring", "error", err) + return false, nil + } + rootKey = rootKey.MakeActive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL, + }, + } + + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("setting prepublished key active failed", "error", err) + return false, err + } + return true, nil + } + + // There's no prepublished key so prepublish one now + if activeKey == nil { - return false, nil // no active key + c.logger.Warn("keyring has no active key: rotate keyring to repair") + return false, nil } - if activeKey.CreateIndex >= rotationThreshold { + + // we rotate at half the rotation threshold because we want to prepublish a key + rotationThreshold := now.Add(-1 * c.srv.config.RootKeyRotationThreshold / 2) + + c.logger.Trace("checking active key eligibility for rotation", + "create_time", activeKey.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if activeKey.CreateTime > rotationThreshold.UnixNano() { return false, nil // key is too new } + // this eval may be processed up to RootKeyGCInterval after the halfway + // mark, so use the CreateTime of the previous key rather than the wall + // clock to set the publish time + publishTime := activeKey.CreateTime + c.srv.config.RootKeyRotationThreshold.Nanoseconds() + req := &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, WriteRequest: structs.WriteRequest{ Region: c.srv.config.Region, AuthToken: eval.LeaderACL, @@ -1014,7 +1094,7 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error { break } keyMeta := 
raw.(*structs.RootKeyMeta) - if !keyMeta.Rekeying() { + if !keyMeta.IsRekeying() { continue } varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID) @@ -1026,6 +1106,23 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error { return err } + rootKey, err := c.srv.encrypter.GetKey(keyMeta.KeyID) + if err != nil { + return fmt.Errorf("rotated key does not exist in keyring: %w", err) + } + rootKey = rootKey.MakeInactive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL}, + } + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("rekey complete but failed to mark key as inactive", "error", err) + return err + } } return nil diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 2f1347c896d..76a04a67de9 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -2609,32 +2610,137 @@ func TestCoreScheduler_CSIBadState_ClaimGC(t *testing.T) { } } +// TestCoreScheduler_RootKeyRotate exercises periodic rotation of the root key +func TestCoreScheduler_RootKeyRotate(t *testing.T) { + ci.Parallel(t) + + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + }) + defer cleanup() + testutil.WaitForKeyring(t, srv.RPC, "global") + + // active key, will never be GC'd + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + // run the core job + snap, err := store.Snapshot() + must.NoError(t, err) + core := NewCoreScheduler(srv, 
snap) + index := key0.ModifyIndex + 1 + eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index) + c := core.(*CoreScheduler) + + // Eval immediately + now := time.Unix(0, key0.CreateTime) + rotated, err := c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.False(t, rotated, must.Sprint("key should not rotate")) + + // Eval after half threshold has passed + c.snap, _ = store.Snapshot() + now = time.Unix(0, key0.CreateTime+(time.Minute*40).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.True(t, rotated, must.Sprint("key should rotate")) + + var key1 *structs.RootKeyMeta + iter, err := store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + if k.KeyID == key0.KeyID { + must.True(t, k.IsActive(), must.Sprint("expected original key to be active")) + } else { + key1 = k + } + } + must.NotNil(t, key1) + must.True(t, key1.IsPrepublished()) + must.Eq(t, key0.CreateTime+time.Hour.Nanoseconds(), key1.PublishTime) + + // Externally rotate with prepublish to add a second prepublished key + resp := &structs.KeyringRotateRootKeyResponse{} + must.NoError(t, srv.RPC("Keyring.Rotate", &structs.KeyringRotateRootKeyRequest{ + PublishTime: key1.PublishTime + (time.Hour * 24).Nanoseconds(), + WriteRequest: structs.WriteRequest{Region: srv.Region()}, + }, resp)) + key2 := resp.Key + + // Eval again with time unchanged + c.snap, _ = store.Snapshot() + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.IsActive(), must.Sprint("original key should still be active")) + case key1.KeyID, key2.KeyID: + must.True(t, k.IsPrepublished(), must.Sprint("new key should be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } + 
+ // Eval again with time after publish time + c.snap, _ = store.Snapshot() + now = time.Unix(0, key1.PublishTime+(time.Minute*10).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.IsInactive(), must.Sprint("original key should be inactive")) + case key1.KeyID: + must.True(t, k.IsActive(), must.Sprint("prepublished key should now be active")) + case key2.KeyID: + must.True(t, k.IsPrepublished(), must.Sprint("later prepublished key should still be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } +} + // TestCoreScheduler_RootKeyGC exercises root key GC func TestCoreScheduler_RootKeyGC(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + c.RootKeyGCThreshold = time.Minute * 10 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") - // reset the time table - srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10) - // active key, will never be GC'd store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + now := key0.CreateTime + yesterday := now - (24 * time.Hour).Nanoseconds() // insert an "old" inactive key - key1 := structs.NewRootKeyMeta() - key1.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) + key1 := structs.NewRootKeyMeta().MakeInactive() + key1.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) // insert an "old" and inactive key with a variable that's using it - key2 := structs.NewRootKeyMeta() - 
key2.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) + key2 := structs.NewRootKeyMeta().MakeInactive() + key2.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) variable := mock.VariableEncrypted() variable.KeyID = key2.KeyID @@ -2643,89 +2749,95 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) { Op: structs.VarOpSet, Var: variable, }) - require.NoError(t, setResp.Error) + must.NoError(t, setResp.Error) // insert an "old" key that's inactive but being used by an alloc - key3 := structs.NewRootKeyMeta() - key3.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) + key3 := structs.NewRootKeyMeta().MakeInactive() + key3.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) // insert the allocation using key3 alloc := mock.Alloc() alloc.ClientStatus = structs.AllocClientStatusRunning alloc.SigningKeyID = key3.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc})) // insert an "old" key that's inactive but being used by an alloc - key4 := structs.NewRootKeyMeta() - key4.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) + key4 := structs.NewRootKeyMeta().MakeInactive() + key4.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) // insert the dead allocation using key4 alloc2 := mock.Alloc() alloc2.ClientStatus = structs.AllocClientStatusFailed alloc2.DesiredStatus = structs.AllocDesiredStatusStop alloc2.SigningKeyID = key4.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2})) - // insert a time table index before the last key - tt := srv.fsm.TimeTable() - tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold)) + // insert an inactive key older than RootKeyGCThreshold but not 
RootKeyRotationThreshold + key5 := structs.NewRootKeyMeta().MakeInactive() + key5.CreateTime = now - (15 * time.Minute).Nanoseconds() + must.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) - // insert a "new" but inactive key - key5 := structs.NewRootKeyMeta() - key5.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) + // prepublishing key should never be GC'd no matter how old + key6 := structs.NewRootKeyMeta().MakePrepublished(yesterday) + key6.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(1600, key6, false)) // run the core job snap, err := store.Snapshot() - require.NoError(t, err) + must.NoError(t, err) core := NewCoreScheduler(srv, snap) eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000) c := core.(*CoreScheduler) - require.NoError(t, c.rootKeyRotateOrGC(eval)) + must.NoError(t, c.rootKeyGC(eval, time.Now())) ws := memdb.NewWatchSet() key, err := store.RootKeyMetaByID(ws, key0.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "active key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("active key should not have been GCd")) key, err = store.RootKeyMetaByID(ws, key1.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old and unused inactive key should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old and unused inactive key should have been GCd")) key, err = store.RootKeyMetaByID(ws, key2.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key should not have been GCd if still in use") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key should not have been GCd if still in use")) key, err = store.RootKeyMetaByID(ws, key3.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key used to sign a live alloc should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key used to sign a live alloc should not have been GCd")) key, err = store.RootKeyMetaByID(ws, 
key4.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old key used to sign a terminal alloc should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old key used to sign a terminal alloc should have been GCd")) key, err = store.RootKeyMetaByID(ws, key5.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "new key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("key newer than GC+rotation threshold should not have been GCd")) + key, err = store.RootKeyMetaByID(ws, key6.KeyID) + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("prepublishing key should not have been GCd")) } // TestCoreScheduler_VariablesRekey exercises variables rekeying func TestCoreScheduler_VariablesRekey(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 1 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2734,7 +2846,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq := &structs.KeyringRotateRootKeyRequest{ @@ -2743,7 +2855,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { }, } var rotateResp structs.KeyringRotateRootKeyResponse - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2752,31 +2864,29 
@@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq.Full = true - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) newKeyID := rotateResp.Key.KeyID - require.Eventually(t, func() bool { - ws := memdb.NewWatchSet() - iter, err := store.Variables(ws) - require.NoError(t, err) - for { - raw := iter.Next() - if raw == nil { - break + must.Wait(t, wait.InitialSuccess( + wait.Timeout(5*time.Second), + wait.Gap(100*time.Millisecond), + wait.BoolFunc(func() bool { + iter, _ := store.Variables(nil) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + variable := raw.(*structs.VariableEncrypted) + if variable.KeyID != newKeyID { + return false + } } - variable := raw.(*structs.VariableEncrypted) - if variable.KeyID != newKeyID { - return false - } - } - return true - }, time.Second*5, 100*time.Millisecond, - "variable rekey should be complete") + originalKey, _ := store.RootKeyMetaByID(nil, key0.KeyID) + return originalKey.IsInactive() + }), + ), must.Sprint("variable rekey should be complete")) } func TestCoreScheduler_FailLoop(t *testing.T) { diff --git a/nomad/encrypter.go b/nomad/encrypter.go index f39a2855bd7..6b23bb9a765 100644 --- a/nomad/encrypter.go +++ b/nomad/encrypter.go @@ -23,9 +23,7 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/go-jose/go-jose/v3/jwt" "github.com/hashicorp/go-hclog" - log "github.com/hashicorp/go-hclog" kms "github.com/hashicorp/go-kms-wrapping/v2" - wrapping "github.com/hashicorp/go-kms-wrapping/v2" "github.com/hashicorp/go-kms-wrapping/v2/aead" "github.com/hashicorp/go-kms-wrapping/wrappers/awskms/v2" "github.com/hashicorp/go-kms-wrapping/wrappers/azurekeyvault/v2" @@ -554,7 +552,7 @@ func (e 
*Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { if wrappedDEK == nil { // older KEK wrapper versions with AEAD-only have the key material in a // different field - wrappedDEK = &wrapping.BlobInfo{Ciphertext: kekWrapper.EncryptedDataEncryptionKey} + wrappedDEK = &kms.BlobInfo{Ciphertext: kekWrapper.EncryptedDataEncryptionKey} } key, err := wrapper.Decrypt(e.srv.shutdownCtx, wrappedDEK) if err != nil { @@ -573,7 +571,7 @@ func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { } else if len(kekWrapper.EncryptedRSAKey) > 0 { // older KEK wrapper versions with AEAD-only have the key material in a // different field - rsaKey, err = wrapper.Decrypt(e.srv.shutdownCtx, &wrapping.BlobInfo{ + rsaKey, err = wrapper.Decrypt(e.srv.shutdownCtx, &kms.BlobInfo{ Ciphertext: kekWrapper.EncryptedRSAKey}) if err != nil { return nil, fmt.Errorf("%w (rsa key): %w", ErrDecryptFailed, err) @@ -652,7 +650,7 @@ func (e *Encrypter) newKMSWrapper(provider *structs.KEKProviderConfig, keyID str config, ok := e.providerConfigs[provider.ID()] if ok { - _, err := wrapper.SetConfig(context.Background(), wrapping.WithConfigMap(config.Config)) + _, err := wrapper.SetConfig(context.Background(), kms.WithConfigMap(config.Config)) if err != nil { return nil, err } @@ -663,7 +661,7 @@ func (e *Encrypter) newKMSWrapper(provider *structs.KEKProviderConfig, keyID str type KeyringReplicator struct { srv *Server encrypter *Encrypter - logger log.Logger + logger hclog.Logger stopFn context.CancelFunc } diff --git a/nomad/encrypter_test.go b/nomad/encrypter_test.go index e3227daf325..092e136bd1e 100644 --- a/nomad/encrypter_test.go +++ b/nomad/encrypter_test.go @@ -568,7 +568,7 @@ func TestEncrypter_Upgrade17(t *testing.T) { oldRootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) must.NoError(t, err) - oldRootKey.Meta.SetActive() + oldRootKey = oldRootKey.MakeActive() // Remove RSAKey to mimic 1.6 oldRootKey.RSAKey = nil diff --git 
a/nomad/keyring_endpoint.go b/nomad/keyring_endpoint.go index 95302b63916..9a383b1c5cb 100644 --- a/nomad/keyring_endpoint.go +++ b/nomad/keyring_endpoint.go @@ -50,13 +50,20 @@ func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *struc if args.Algorithm == "" { args.Algorithm = structs.EncryptionAlgorithmAES256GCM } + if args.Full && args.PublishTime > 0 { + return fmt.Errorf("keyring cannot be prepublished and full rotated at the same time") + } rootKey, err := structs.NewRootKey(args.Algorithm) if err != nil { return err } - rootKey.Meta.SetActive() + if args.PublishTime != 0 { + rootKey.Meta = rootKey.Meta.MakePrepublished(args.PublishTime) + } else { + rootKey.Meta = rootKey.Meta.MakeActive() + } // make sure it's been added to the local keystore before we write // it to raft, so that followers don't try to Get a key that @@ -329,7 +336,7 @@ func (k *Keyring) Delete(args *structs.KeyringDeleteRootKeyRequest, reply *struc if keyMeta == nil { return nil // safe to bail out early } - if keyMeta.Active() { + if keyMeta.IsActive() { return fmt.Errorf("active root key cannot be deleted - call rotate first") } diff --git a/nomad/keyring_endpoint_test.go b/nomad/keyring_endpoint_test.go index e0573fad840..7aa999e97f9 100644 --- a/nomad/keyring_endpoint_test.go +++ b/nomad/keyring_endpoint_test.go @@ -32,7 +32,7 @@ func TestKeyringEndpoint_CRUD(t *testing.T) { key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) require.NoError(t, err) id := key.Meta.KeyID - key.Meta.SetActive() + key = key.MakeActive() updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -106,7 +106,7 @@ func TestKeyringEndpoint_CRUD(t *testing.T) { require.EqualError(t, err, "active root key cannot be deleted - call rotate first") // set inactive - updateReq.RootKey.Meta.SetInactive() + updateReq.RootKey.Meta = updateReq.RootKey.Meta.MakeInactive() err = msgpackrpc.CallWithCodec(codec, "Keyring.Update", updateReq, &updateResp) require.NoError(t, 
err) @@ -142,7 +142,7 @@ func TestKeyringEndpoint_InvalidUpdates(t *testing.T) { key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) require.NoError(t, err) id := key.Meta.KeyID - key.Meta.SetActive() + key = key.MakeActive() updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -228,10 +228,14 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { testutil.WaitForKeyring(t, srv.RPC, "global") codec := rpcClient(t, srv) + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NoError(t, err) + // Setup an existing key key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) - require.NoError(t, err) - key.Meta.SetActive() + must.NoError(t, err) + key1 := key.Meta updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -242,7 +246,7 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var updateResp structs.KeyringUpdateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Update", updateReq, &updateResp) - require.NoError(t, err) + must.NoError(t, err) // Rotate the key @@ -253,14 +257,13 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var rotateResp structs.KeyringRotateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.EqualError(t, err, structs.ErrPermissionDenied.Error()) + must.EqError(t, err, structs.ErrPermissionDenied.Error()) rotateReq.AuthToken = rootToken.SecretID err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.NoError(t, err) - require.NotEqual(t, updateResp.Index, rotateResp.Index) - - newID := rotateResp.Key.KeyID + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key2 := rotateResp.Key // Verify we have a new key and the old one is inactive @@ -272,31 +275,62 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var listResp structs.KeyringListRootKeyMetaResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) - 
require.NoError(t, err) - - require.Greater(t, listResp.Index, updateResp.Index) - require.Len(t, listResp.Keys, 3) // bootstrap + old + new + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 3, listResp.Keys) // bootstrap + old + new for _, keyMeta := range listResp.Keys { - if keyMeta.KeyID != newID { - require.False(t, keyMeta.Active(), "expected old keys to be inactive") - } else { - require.True(t, keyMeta.Active(), "expected new key to be inactive") + switch keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.IsInactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.IsActive(), must.Sprint("expected new key to be active")) } } getReq := &structs.KeyringGetRootKeyRequest{ - KeyID: newID, + KeyID: key2.KeyID, QueryOptions: structs.QueryOptions{ Region: "global", }, } var getResp structs.KeyringGetRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Get", getReq, &getResp) - require.NoError(t, err) + must.NoError(t, err) + must.Len(t, 32, getResp.Key.Key) + + // Rotate the key with prepublishing + + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + rotateResp = structs.KeyringRotateRootKeyResponse{} + rotateReq = &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, + WriteRequest: structs.WriteRequest{ + Region: "global", + AuthToken: rootToken.SecretID, + }, + } + err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key3 := rotateResp.Key - gotKey := getResp.Key - require.Len(t, gotKey.Key, 32) + listResp = structs.KeyringListRootKeyMetaResponse{} + err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 4, listResp.Keys) // bootstrap + old + new + prepublished + + for _, keyMeta := range listResp.Keys { + switch 
keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.IsInactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.IsActive(), must.Sprint("expected active key to remain active")) + case key3.KeyID: + must.True(t, keyMeta.IsPrepublished(), must.Sprint("expected new key to be prepublished")) + } + } } // TestKeyringEndpoint_ListPublic asserts the Keyring.ListPublic RPC returns diff --git a/nomad/leader.go b/nomad/leader.go index fc5a0b09d40..d771a3cd160 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -2760,7 +2760,7 @@ func (s *Server) initializeKeyring(stopCh <-chan struct{}) { logger.Trace("initializing keyring") rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) - rootKey.Meta.SetActive() + rootKey = rootKey.MakeActive() if err != nil { logger.Error("could not initialize keyring: %v", err) return diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index acd4c782c28..6fce8574da4 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -7340,10 +7340,10 @@ func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKe existing := raw.(*structs.RootKeyMeta) rootKeyMeta.CreateIndex = existing.CreateIndex rootKeyMeta.CreateTime = existing.CreateTime - isRotation = !existing.Active() && rootKeyMeta.Active() + isRotation = !existing.IsActive() && rootKeyMeta.IsActive() } else { rootKeyMeta.CreateIndex = index - isRotation = rootKeyMeta.Active() + isRotation = rootKeyMeta.IsActive() } rootKeyMeta.ModifyIndex = index @@ -7369,14 +7369,14 @@ func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKe switch key.State { case structs.RootKeyStateInactive: if rekey { - key.SetRekeying() + key = key.MakeRekeying() modified = true } case structs.RootKeyStateActive: if rekey { - key.SetRekeying() + key = key.MakeRekeying() } else { - key.SetInactive() + key = key.MakeInactive() } modified = true case 
structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated: @@ -7475,7 +7475,7 @@ func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMe break } key := raw.(*structs.RootKeyMeta) - if key.Active() { + if key.IsActive() { return key, nil } } diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 04e0a0d4c1f..7e4243296f2 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -10776,7 +10776,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { key := structs.NewRootKeyMeta() keyIDs = append(keyIDs, key.KeyID) if i == 0 { - key.SetActive() + key = key.MakeActive() } index++ require.NoError(t, store.UpsertRootKeyMeta(index, key, false)) @@ -10792,8 +10792,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { require.NoError(t, err) require.NotNil(t, inactiveKey) oldCreateIndex := inactiveKey.CreateIndex - newlyActiveKey := inactiveKey.Copy() - newlyActiveKey.SetActive() + newlyActiveKey := inactiveKey.MakeActive() index++ require.NoError(t, store.UpsertRootKeyMeta(index, newlyActiveKey, false)) @@ -10806,10 +10805,10 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { } key := raw.(*structs.RootKeyMeta) if key.KeyID == newlyActiveKey.KeyID { - require.True(t, key.Active(), "expected updated key to be active") + require.True(t, key.IsActive(), "expected updated key to be active") require.Equal(t, oldCreateIndex, key.CreateIndex) } else { - require.False(t, key.Active(), "expected other keys to be inactive") + require.False(t, key.IsActive(), "expected other keys to be inactive") } } @@ -10827,7 +10826,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { } key := raw.(*structs.RootKeyMeta) require.NotEqual(t, keyIDs[1], key.KeyID) - require.False(t, key.Active(), "expected remaining keys to be inactive") + require.False(t, key.IsActive(), "expected remaining keys to be inactive") found++ } require.Equal(t, 2, found, "expected only 2 keys remaining") 
diff --git a/nomad/structs/keyring.go b/nomad/structs/keyring.go index 67379d89a9b..64a47798d40 100644 --- a/nomad/structs/keyring.go +++ b/nomad/structs/keyring.go @@ -11,6 +11,7 @@ import ( "fmt" "maps" "net/url" + "slices" "time" "github.com/go-jose/go-jose/v3" @@ -77,6 +78,33 @@ func NewRootKey(algorithm EncryptionAlgorithm) (*RootKey, error) { return rootKey, nil } +func (k *RootKey) Copy() *RootKey { + return &RootKey{ + Meta: k.Meta.Copy(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + +// MakeActive returns a copy of the RootKey with the meta state set to active +func (k *RootKey) MakeActive() *RootKey { + return &RootKey{ + Meta: k.Meta.MakeActive(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + +// MakeInactive returns a copy of the RootKey with the meta state set to +// inactive +func (k *RootKey) MakeInactive() *RootKey { + return &RootKey{ + Meta: k.Meta.MakeInactive(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + // RootKeyMeta is the metadata used to refer to a RootKey. It is // stored in raft. type RootKeyMeta struct { @@ -86,6 +114,7 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + PublishTime int64 } // KEKProviderName enum are the built-in KEK providers. @@ -145,9 +174,10 @@ func (c *KEKProviderConfig) ID() string { type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStatePrepublished = "prepublished" // RootKeyStateDeprecated is, itself, deprecated and is no longer in // use.
For backwards compatibility, any existing keys with this state will @@ -177,33 +207,66 @@ type RootKeyMetaStub struct { State RootKeyState } -// Active indicates his key is the one currently being used for -// crypto operations (at most one key can be Active) -func (rkm *RootKeyMeta) Active() bool { +// IsActive indicates this key is the one currently being used for crypto +// operations (at most one key can be Active) +func (rkm *RootKeyMeta) IsActive() bool { return rkm.State == RootKeyStateActive } -func (rkm *RootKeyMeta) SetActive() { - rkm.State = RootKeyStateActive +// MakeActive returns a copy of the RootKeyMeta with the state set to active +func (rkm *RootKeyMeta) MakeActive() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateActive + out.PublishTime = 0 + } + return out } -// Rekeying indicates that variables encrypted with this key should be +// IsRekeying indicates that variables encrypted with this key should be // rekeyed -func (rkm *RootKeyMeta) Rekeying() bool { +func (rkm *RootKeyMeta) IsRekeying() bool { return rkm.State == RootKeyStateRekeying } -func (rkm *RootKeyMeta) SetRekeying() { - rkm.State = RootKeyStateRekeying +// MakeRekeying returns a copy of the RootKeyMeta with the state set to rekeying +func (rkm *RootKeyMeta) MakeRekeying() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateRekeying + } + return out +} + +// MakePrepublished returns a copy of the RootKeyMeta with the state set to +// prepublished at the time t +func (rkm *RootKeyMeta) MakePrepublished(t int64) *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.PublishTime = t + out.State = RootKeyStatePrepublished + } + return out } -func (rkm *RootKeyMeta) SetInactive() { - rkm.State = RootKeyStateInactive +// IsPrepublished indicates that this key has been published and is pending +// being promoted to active +func (rkm *RootKeyMeta) IsPrepublished() bool { + return rkm.State == RootKeyStatePrepublished } -// 
Inactive indicates that this key is no longer being used to encrypt new +// MakeInactive returns a copy of the RootKeyMeta with the state set to inactive +func (rkm *RootKeyMeta) MakeInactive() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateInactive + } + return out +} + +// IsInactive indicates that this key is no longer being used to encrypt new // variables or workload identities. -func (rkm *RootKeyMeta) Inactive() bool { +func (rkm *RootKeyMeta) IsInactive() bool { return rkm.State == RootKeyStateInactive || rkm.State == RootKeyStateDeprecated } @@ -239,7 +302,7 @@ func (rkm *RootKeyMeta) Validate() error { } switch rkm.State { case RootKeyStateInactive, RootKeyStateActive, - RootKeyStateRekeying, RootKeyStateDeprecated: + RootKeyStateRekeying, RootKeyStateDeprecated, RootKeyStatePrepublished: default: return fmt.Errorf("root key state %q is invalid", rkm.State) } @@ -277,8 +340,9 @@ const ( // KeyringRotateRootKeyRequest is the argument to the Keyring.Rotate RPC type KeyringRotateRootKeyRequest struct { - Algorithm EncryptionAlgorithm - Full bool + Algorithm EncryptionAlgorithm + Full bool + PublishTime int64 WriteRequest } diff --git a/website/content/docs/commands/operator/root/keyring-rotate.mdx b/website/content/docs/commands/operator/root/keyring-rotate.mdx index 2ba424ef241..aef49c6265a 100644 --- a/website/content/docs/commands/operator/root/keyring-rotate.mdx +++ b/website/content/docs/commands/operator/root/keyring-rotate.mdx @@ -25,19 +25,34 @@ nomad operator root keyring rotate [options] ## Rotate Options - `-full`: Decrypt all existing variables and re-encrypt with the new key. This - command will immediately return and the re-encryption process will run - asynchronously on the leader. + command will immediately return and the re-encryption process will run + asynchronously on the leader. + +- `-now`: Publish the new key immediately without prepublishing. One of `-now` + or `-prepublish` must be set. 
+ +- `-prepublish`: Set a duration for which to prepublish the new key + (ex. "1h"). The currently active key will be unchanged but the new public key + will be available in the JWKS endpoint. Multiple keys can be prepublished and + they will be promoted to active in order of publish time, at most once every + [`root_key_gc_interval`][]. One of `-now` or `-prepublish` must be set. - `-verbose`: Enable verbose output ## Examples ```shell-session -$ nomad operator root keyring rotate -Key State Create Time -f19f6029 active 2022-07-11T19:14:36Z +$ nomad operator root keyring rotate -now +Key State Create Time Publish Time +f19f6029 active 2022-07-11T19:14:36Z -$ nomad operator root keyring rotate -verbose -Key State Create Time -53186ac1-9002-c4b6-216d-bb19fd37a791 active 2022-07-11T19:14:47Z +$ nomad operator root keyring rotate -now -verbose +Key State Create Time Publish Time +53186ac1-9002-c4b6-216d-bb19fd37a791 active 2022-07-11T19:14:47Z + +$ nomad operator root keyring rotate -prepublish 1h +Key State Create Time Publish Time +7f15e4e9 prepublished 2022-07-11T19:15:10Z 2022-07-11T20:15:10Z ``` + +[`root_key_gc_interval`]: /nomad/docs/configuration/server#root_key_gc_interval diff --git a/website/content/docs/configuration/server.mdx b/website/content/docs/configuration/server.mdx index 3bde9320772..a1a4fa113d9 100644 --- a/website/content/docs/configuration/server.mdx +++ b/website/content/docs/configuration/server.mdx @@ -236,13 +236,16 @@ server { - `root_key_gc_interval` `(string: "10m")` - Specifies the interval between [encryption key][] metadata garbage collections. -- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time that an - [encryption key][] must exist before it can be eligible for garbage - collection. +- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time after + the `root_key_rotation_threshold` has passed that an [encryption key][] must + exist before it can be eligible for garbage collection.
-- `root_key_rotation_threshold` `(string: "720h")` - Specifies the minimum time - that an [encryption key][] must exist before it is automatically rotated on - the next garbage collection interval. +- `root_key_rotation_threshold` `(string: "720h")` - Specifies the lifetime of + an active [encryption key][] before it is automatically rotated on the next + garbage collection interval. Nomad will prepublish the replacement key at half + the `root_key_rotation_threshold` time so external consumers of Workload + Identity have time to obtain the new public key from the [JWKS URL][] before + it is used. - `server_join` ([server_join][server-join]: nil) - Specifies how the Nomad server will connect to other Nomad servers. The `retry_join` @@ -517,3 +520,4 @@ work. [wi]: /nomad/docs/concepts/workload-identity [Configure for multiple regions]: /nomad/tutorials/access-control/access-control-bootstrap#configure-for-multiple-regions [top_level_data_dir]: /nomad/docs/configuration#data_dir +[JWKS URL]: /nomad/api-docs/operator/keyring#list-active-public-keys diff --git a/website/content/docs/integrations/consul/acl.mdx b/website/content/docs/integrations/consul/acl.mdx index 9499fe7fe2a..d5bc5d4b358 100644 --- a/website/content/docs/integrations/consul/acl.mdx +++ b/website/content/docs/integrations/consul/acl.mdx @@ -349,10 +349,10 @@ You may also host the JWKS JSON response from Nomad in an external location that is reachable by the Consul servers, and use that address as the value for `JWKSURL`. -It is important to remember that the Nomad keys **are rotated periodically**, -so both approaches should be automated and done continually. The rotation -frequency is controlled by the [`server.root_key_rotation_threshold`][] -configuration of the Nomad servers. +It is important to remember that the Nomad keys **are rotated periodically**, so +both approaches should be automated and done continually. 
The rotation frequency +is controlled by the [`server.root_key_rotation_threshold`][] configuration of +the Nomad servers. Keys will be prepublished at half the rotation threshold. ### Additional References diff --git a/website/content/docs/integrations/vault/acl.mdx b/website/content/docs/integrations/vault/acl.mdx index f6fe3d09929..13ea2586011 100644 --- a/website/content/docs/integrations/vault/acl.mdx +++ b/website/content/docs/integrations/vault/acl.mdx @@ -316,10 +316,10 @@ You may also host the JWKS JSON response from Nomad in an external location that is reachable by the Vault servers, and use that address as the value for `jwks_url`. -It is important to remember that the Nomad keys **are rotated periodically**, -so both approaches should be automated and done continually. The rotation -frequency is controlled by the [`server.root_key_rotation_threshold`][] -configuration of the Nomad servers. +It is important to remember that the Nomad keys **are rotated periodically**, so +both approaches should be automated and done continually. The rotation frequency +is controlled by the [`server.root_key_rotation_threshold`][] configuration of +the Nomad servers. Keys will be prepublished at half the rotation threshold. ### Additional References diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 181e624e2dc..4e9467d2f92 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -13,7 +13,19 @@ upgrade. However, specific versions of Nomad may have more details provided for their upgrades as a result of new features or changed behavior. This page is used to document those details separately from the standard upgrade flow. 
-## Nomad 1.8.2 (UNRELEASED) +## Nomad 1.8.3 (UNRELEASED) + +#### Nomad keyring rotation + +In Nomad 1.8.3, the Nomad root keyring will prepublish keys at half the +`root_key_rotation_threshold` and promote them to active once the +`root_key_rotation_threshold` has passed. The `nomad operator root keyring +rotate` command now requires one of two arguments: `-prepublish <duration>` to +prepublish a key or `-now` to rotate immediately. We recommend using +`-prepublish` to avoid outages from workload identities used to log into +external services such as Vault or Consul. + +## Nomad 1.8.2 #### New `windows_allow_insecure_container_admin` configuration option for Docker driver