Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

keyring: safely handle missing keys and restore GC #15092

Merged
merged 11 commits into from
Nov 1, 2022
2 changes: 1 addition & 1 deletion .changelog/15034.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
```release-note:bug
keyring: Removed root key garbage collection to avoid orphaned workload identities
```
```
7 changes: 7 additions & 0 deletions .changelog/15092.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:bug
keyring: Fixed a bug where a missing key would prevent any further replication.
```

```release-note:bug
keyring: Re-enabled keyring garbage collection after fixing a bug where keys would be garbage collected even if they were used to sign a live allocation's workload identity.
```
103 changes: 93 additions & 10 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
case structs.CoreJobGlobalTokenExpiredGC:
return c.expiredACLTokenGC(eval, true)
case structs.CoreJobRootKeyRotateOrGC:
return c.rootKeyRotate(eval)
return c.rootKeyRotateOrGC(eval)
case structs.CoreJobVariablesRekey:
return c.variablesRekey(eval)
case structs.CoreJobForceGC:
Expand Down Expand Up @@ -96,7 +96,9 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredACLTokenGC(eval, true); err != nil {
return err
}

if err := c.rootKeyGC(eval); err != nil {
return err
}
// Node GC must occur after the others to ensure the allocations are
// cleared.
return c.nodeGC(eval)
Expand Down Expand Up @@ -893,23 +895,93 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{})
}

// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {

// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
wasRotated, err := c.rootKeyRotate(eval)
if err != nil {
return err
}
if wasRotated {
return nil
}
return c.rootKeyGC(eval)
}

func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {

oldThreshold := c.getThreshold(eval, "root key",
"root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
if err != nil {
return err
}

for {
raw := iter.Next()
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if keyMeta.Active() || keyMeta.Rekeying() {
continue // never GC the active key or one we're rekeying
}
if keyMeta.CreateIndex > oldThreshold {
continue // don't GC recent keys
}

// don't GC a key that's deprecated, that's encrypted a variable or
// signed a live workload identity
inUse := false
if !keyMeta.Deprecated() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it will reach the delete case if keyMeta.Deprecated() returns true, which doesn't seem to jibe with the comment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropping from our sidebar discussion on Slack:

I think I've figured out why it exists, and we don't need it anymore:

In the original design, we didn't have a way of determining if the key was in use by a WI except by guessing (incorrectly, as it turned out) based on the indexes. So we marked it as deprecated b/c the GC that actually removed it could happen much later and having a status that let the user know we've done a rekey on it was maybe helpful for diagnostics.

In the new design, we can just not touch the key at all in the rekey pass. It'll already be marked inactive because the key was rotated. The next periodic GC will delete the key if it's not in use. Much simpler.

I've done that and I think this looks pretty good?

inUse, err = c.snap.IsRootKeyMetaInUse(keyMeta.KeyID)
if err != nil {
return err
}
}
if inUse {
continue
}

req := &structs.KeyringDeleteRootKeyRequest{
KeyID: keyMeta.KeyID,
WriteRequest: structs.WriteRequest{
Region: c.srv.config.Region,
AuthToken: eval.LeaderACL,
},
}
if err := c.srv.RPC("Keyring.Delete",
req, &structs.KeyringDeleteRootKeyResponse{}); err != nil {
c.logger.Error("root key delete failed", "error", err)
return err
}
}

return nil
}

// rootKeyRotate checks if the active key is old enough that we need
// to kick off a rotation.
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) {

rotationThreshold := c.getThreshold(eval, "root key",
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)

ws := memdb.NewWatchSet()
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
if err != nil {
return err
return false, err
}
if activeKey == nil {
return nil // no active key
return false, nil // no active key
}
if activeKey.CreateIndex >= rotationThreshold {
return nil // key is too new
return false, nil // key is too new
}

req := &structs.KeyringRotateRootKeyRequest{
Expand All @@ -921,10 +993,10 @@ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
if err := c.srv.RPC("Keyring.Rotate",
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
c.logger.Error("root key rotation failed", "error", err)
return err
return false, err
}

return nil
return true, nil
}

// variablesReKey is optionally run after rotating the active
Expand Down Expand Up @@ -958,9 +1030,20 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
return err
}

// we've now rotated all this key's variables, so set its state
// we've rotated this key's variables, but we need to ensure it hasn't
// been used to sign any live workload identities before it's safe to
// mark as deprecated
inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID)
tgross marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return err
}

keyMeta = keyMeta.Copy()
keyMeta.SetDeprecated()
if inUse {
keyMeta.SetInactive()
} else {
keyMeta.SetDeprecated()
}

key, err := c.srv.encrypter.GetKey(keyMeta.KeyID)
if err != nil {
Expand Down
86 changes: 69 additions & 17 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2466,38 +2466,90 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
// reset the time table
srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

// active key, will never be GC'd
store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, key0, "expected keyring to be bootstapped")
require.NoError(t, err)

// insert an "old" deprecated key
key1 := structs.NewRootKeyMeta()
key1.SetDeprecated()
require.NoError(t, store.UpsertRootKeyMeta(500, key1, false))

// insert an "old" inactive key
key2 := structs.NewRootKeyMeta()
key2.SetDeprecated()
require.NoError(t, store.UpsertRootKeyMeta(600, key2, false))

// insert an "old" and inactive key with a variable that's using it
key3 := structs.NewRootKeyMeta()
key3.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(700, key3, false))

variable := mock.VariableEncrypted()
variable.KeyID = key3.KeyID

setResp := store.VarSet(601, &structs.VarApplyStateRequest{
Op: structs.VarOpSet,
Var: variable,
})
require.NoError(t, setResp.Error)

// insert an "old" key that's inactive but being used by an alloc
key4 := structs.NewRootKeyMeta()
key4.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(800, key4, false))

// insert the allocation using key3
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.SigningKeyID = key4.KeyID
require.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc}))

// insert a time table index before the last key
tt := srv.fsm.TimeTable()
tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))

// insert a "new" but inactive key
key5 := structs.NewRootKeyMeta()
key5.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false))

// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotate(eval))
require.NoError(t, c.rootKeyRotateOrGC(eval))

got, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.Equal(t, got.KeyID, key0.KeyID)
ws := memdb.NewWatchSet()
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "active key should not have been GCd")

// insert a time table index after the key
tt := srv.fsm.TimeTable()
tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold))
key, err = store.RootKeyMetaByID(ws, key1.KeyID)
require.NoError(t, err)
require.Nil(t, key, "old deprecated key should have been GCd")

// re-run the core job
snap, err = store.Snapshot()
key, err = store.RootKeyMetaByID(ws, key2.KeyID)
require.NoError(t, err)
require.Nil(t, key, "old and unused inactive key should have been GCd")

key, err = store.RootKeyMetaByID(ws, key3.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key should not have been GCd if still in use")

key, err = store.RootKeyMetaByID(ws, key4.KeyID)
require.NoError(t, err)
core = NewCoreScheduler(srv, snap)
eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000)
c = core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotate(eval))

got, err = store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.NotEqual(t, got.KeyID, key0.KeyID)
require.NotNil(t, key, "old key used to sign an alloc should not have been GCd")

key, err = store.RootKeyMetaByID(ws, key5.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "new key should not have been GCd")

}

// TestCoreScheduler_VariablesRekey exercises variables rekeying
Expand Down
Loading