diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 31cecd30b56a..8563e1c0cc75 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -52,6 +52,7 @@ server.auth_log.sql_connections.enabled boolean false if set, log SQL client con server.auth_log.sql_sessions.enabled boolean false if set, log SQL session login/disconnection events (note: may hinder performance on loaded nodes) tenant-rw server.authentication_cache.enabled boolean true enables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related information tenant-rw server.child_metrics.enabled boolean false enables the exporting of child metrics, additional prometheus time series with extra labels tenant-rw +server.client_cert_expiration_cache.capacity integer 1000 the maximum number of client cert expirations stored tenant-rw server.clock.forward_jump_check_enabled boolean false if enabled, forward clock jumps > max_offset/2 will cause a panic tenant-rw server.clock.persist_upper_bound_interval duration 0s the interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature. tenant-rw server.eventlog.enabled boolean true if set, logged notable events are also stored in the table system.eventlog tenant-rw diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index efc37fc73279..1d8a39ed285e 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -79,6 +79,7 @@
server.auth_log.sql_sessions.enabled
booleanfalseif set, log SQL session login/disconnection events (note: may hinder performance on loaded nodes)Serverless/Dedicated/Self-Hosted
server.authentication_cache.enabled
booleantrueenables a cache used during authentication to avoid lookups to system tables when retrieving per-user authentication-related informationServerless/Dedicated/Self-Hosted
server.child_metrics.enabled
booleanfalseenables the exporting of child metrics, additional prometheus time series with extra labelsServerless/Dedicated/Self-Hosted +
server.client_cert_expiration_cache.capacity
integer1000the maximum number of client cert expirations storedServerless/Dedicated/Self-Hosted
server.clock.forward_jump_check_enabled
booleanfalseif enabled, forward clock jumps > max_offset/2 will cause a panicServerless/Dedicated/Self-Hosted
server.clock.persist_upper_bound_interval
duration0sthe interval between persisting the wall time upper bound of the clock. The clock does not generate a wall time greater than the persisted timestamp and will panic if it sees a wall time greater than this value. When cockroach starts, it waits for the wall time to catch-up till this persisted timestamp. This guarantees monotonic wall time across server restarts. Not setting this or setting a value of 0 disables this feature.Serverless/Dedicated/Self-Hosted
server.consistency_check.max_rate
byte size8.0 MiBthe rate limit (bytes/sec) to use for consistency checks; used in conjunction with server.consistency_check.interval to control the frequency of consistency checks. Note that setting this too high can negatively impact performance.Dedicated/Self-Hosted diff --git a/pkg/security/BUILD.bazel b/pkg/security/BUILD.bazel index ae463214c161..bdfaedaa3b2a 100644 --- a/pkg/security/BUILD.bazel +++ b/pkg/security/BUILD.bazel @@ -6,8 +6,10 @@ go_library( srcs = [ "auth.go", "auto_tls_init.go", + "cert_expiry_cache.go", "certificate_loader.go", "certificate_manager.go", + "certificate_metrics.go", "certs.go", "join_token.go", "ocsp.go", @@ -31,11 +33,14 @@ go_library( "//pkg/server/telemetry", "//pkg/settings", "//pkg/settings/cluster", + "//pkg/util/cache", "//pkg/util/encoding", "//pkg/util/envutil", "//pkg/util/log", "//pkg/util/log/eventpb", "//pkg/util/metric", + "//pkg/util/metric/aggmetric", + "//pkg/util/mon", "//pkg/util/quotapool", "//pkg/util/randutil", "//pkg/util/stop", @@ -58,6 +63,7 @@ go_test( srcs = [ "auth_test.go", "auto_tls_init_test.go", + "cert_expiry_cache_test.go", "certificate_loader_test.go", "certificate_manager_test.go", "certs_rotation_test.go", @@ -81,12 +87,17 @@ go_test( "//pkg/security/securitytest", "//pkg/security/username", "//pkg/server", + "//pkg/settings/cluster", "//pkg/testutils", "//pkg/testutils/serverutils", "//pkg/util/envutil", "//pkg/util/leaktest", "//pkg/util/log", + "//pkg/util/metric", + "//pkg/util/metric/aggmetric", + "//pkg/util/mon", "//pkg/util/randutil", + "//pkg/util/stop", "//pkg/util/timeutil", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", diff --git a/pkg/security/auth.go b/pkg/security/auth.go index c586e41defd6..0f7ee0992e59 100644 --- a/pkg/security/auth.go +++ b/pkg/security/auth.go @@ -143,7 +143,11 @@ func Contains(sl []string, s string) bool { // UserAuthCertHook builds an authentication hook based on the security // mode and client certificate. func UserAuthCertHook( - insecureMode bool, tlsState *tls.ConnectionState, tenantID roachpb.TenantID, + insecureMode bool, + tlsState *tls.ConnectionState, + tenantID roachpb.TenantID, + certManager *CertificateManager, + cache *ClientCertExpirationCache, ) (UserAuthHook, error) { var certUserScope []CertificateUserScope if !insecureMode { @@ -177,14 +181,24 @@ func UserAuthCertHook( return nil } + peerCert := tlsState.PeerCertificates[0] + // The client certificate should not be a tenant client type. For now just // check that it doesn't have OU=Tenants. It would make sense to add // explicit OU=Users to all client certificates and to check for match. - if IsTenantCertificate(tlsState.PeerCertificates[0]) { + if IsTenantCertificate(peerCert) { return errors.Errorf("using tenant client certificate as user certificate is not allowed") } if ValidateUserScope(certUserScope, systemIdentity.Normalized(), tenantID) { + if certManager != nil { + cache.MaybeUpsert( + ctx, + systemIdentity.Normalized(), + peerCert.NotAfter.Unix(), + certManager.certMetrics.ClientExpiration, + ) + } return nil } return errors.WithDetailf(errors.Errorf("certificate authentication failed for user %q", systemIdentity), diff --git a/pkg/security/auth_test.go b/pkg/security/auth_test.go index b4de758a3410..62fd88323d0a 100644 --- a/pkg/security/auth_test.go +++ b/pkg/security/auth_test.go @@ -321,7 +321,13 @@ func TestAuthenticationHook(t *testing.T) { if err != nil { t.Fatal(err) } - hook, err := security.UserAuthCertHook(tc.insecure, makeFakeTLSState(t, tc.tlsSpec), tc.tenantID) + hook, err := security.UserAuthCertHook( + tc.insecure, + makeFakeTLSState(t, tc.tlsSpec), + tc.tenantID, + nil, /* certManager */ + nil, /* cache */ + ) if (err == nil) != tc.buildHookSuccess { t.Fatalf("expected success=%t, got err=%v", tc.buildHookSuccess, err) } diff --git a/pkg/security/cert_expiry_cache.go b/pkg/security/cert_expiry_cache.go new file mode 100644 index 000000000000..07a5f8a2c618 --- /dev/null +++ b/pkg/security/cert_expiry_cache.go @@ -0,0 +1,227 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package security + +import ( + "context" + math_rand "math/rand" + "time" + "unsafe" + + "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/util/cache" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" + "github.com/cockroachdb/cockroach/pkg/util/mon" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" +) + +// CacheCapacityMax is set arbitrarily high; configurable later if needed. +const CacheCapacityMax = 65000 + +// ClientCertExpirationCacheCapacity is the cluster setting that controls the +// maximum number of client cert expirations in the cache. +var ClientCertExpirationCacheCapacity = settings.RegisterIntSetting( + settings.TenantWritable, + "server.client_cert_expiration_cache.capacity", + "the maximum number of client cert expirations stored", + 1000, +).WithPublic() + +// ClientCertExpirationCache contains a cache of gauge objects keyed by +// SQL username strings. It is a FIFO cache that stores gauges valued by +// minimum expiration of the client certs seen (per user). +type ClientCertExpirationCache struct { + mu struct { + // NB: Cannot be a RWMutex for Get because UnorderedCache.Get manipulates + // an internal hashmap. + syncutil.Mutex + cache *cache.UnorderedCache + acc mon.BoundAccount + } + settings *cluster.Settings + stopper *stop.Stopper + mon *mon.BytesMonitor + timeSrc interface{} +} + +func NewClientCertExpirationCache( + ctx context.Context, + st *cluster.Settings, + stopper *stop.Stopper, + timeSrc timeutil.TimeSource, + parentMon *mon.BytesMonitor, +) *ClientCertExpirationCache { + c := &ClientCertExpirationCache{settings: st} + c.stopper = stopper + + switch timeSrc := timeSrc.(type) { + case *timeutil.DefaultTimeSource, *timeutil.ManualTime: + c.timeSrc = timeSrc + default: + c.timeSrc = &timeutil.DefaultTimeSource{} + } + + c.mu.cache = cache.NewUnorderedCache(cache.Config{ + Policy: cache.CacheFIFO, + ShouldEvict: func(size int, _, value interface{}) bool { + var capacity int64 + settingCapacity := ClientCertExpirationCacheCapacity.Get(&st.SV) + if settingCapacity < CacheCapacityMax { + capacity = settingCapacity + } else { + capacity = CacheCapacityMax + } + return int64(size) > capacity + }, + OnEvictedEntry: func(entry *cache.Entry) { + gauge := entry.Value.(*aggmetric.Gauge) + gauge.Unlink() + c.mu.acc.Shrink(ctx, int64(unsafe.Sizeof(*gauge))) + }, + }) + c.mon = mon.NewMonitorInheritWithLimit( + "client-expiration-cache", 0 /* limit */, parentMon, + ) + c.mu.acc = c.mon.MakeBoundAccount() + c.mon.StartNoReserved(ctx, parentMon) + + // Begin an async task to periodically evict entries associated with + // expiration values that are in the past. + if err := c.startPurgePastExpirations(ctx); err != nil { + log.Ops.Warningf( + ctx, "failed to initiate periodic purge of expiration cache entries: %v", err, + ) + } + + return c +} + +// Get retrieves the cert expiration for the given username, if it exists. +// An expiration of 0 indicates an entry was not found. +func (c *ClientCertExpirationCache) Get(key string) (int64, bool) { + c.mu.Lock() + defer c.mu.Unlock() + value, ok := c.mu.cache.Get(key) + if !ok { + return 0, ok + } + // If the expiration has already been reached, remove the entry and indicate + // that the entry was not found. + gauge := value.(*aggmetric.Gauge) + if gauge.Value() < c.timeNow() { + c.mu.cache.Del(key) + return 0, false + } + return gauge.Value(), ok +} + +// MaybeUpsert may update or insert a client cert expiration gauge for a +// particular user into the cache. An update is contingent on whether the +// old expiration is after the new expiration. This ensures that the cache +// maintains the minimum expiration for each user. +func (c *ClientCertExpirationCache) MaybeUpsert( + ctx context.Context, key string, newExpiry int64, parentGauge *aggmetric.AggGauge, +) { + c.mu.Lock() + defer c.mu.Unlock() + + value, ok := c.mu.cache.Get(key) + if !ok { + err := c.mu.acc.Grow(ctx, int64(unsafe.Sizeof(aggmetric.Gauge{}))) + if err == nil { + // Only create new gauges for expirations in the future. + if newExpiry > c.timeNow() { + gauge := parentGauge.AddChild(key) + gauge.Update(newExpiry) + c.mu.cache.Add(key, gauge) + } + } else { + log.Ops.Warningf(ctx, "no memory available to cache cert expiry: %v", err) + } + } else if gauge := value.(*aggmetric.Gauge); newExpiry < gauge.Value() || gauge.Value() == 0 { + gauge.Update(newExpiry) + } +} + +func (c *ClientCertExpirationCache) Clear() { + c.mu.Lock() + defer c.mu.Unlock() + c.mu.cache.Clear() +} + +// Len returns the number of cert expirations in the cache. +func (c *ClientCertExpirationCache) Len() int { + c.mu.Lock() + defer c.mu.Unlock() + return c.mu.cache.Len() +} + +// timeNow returns the current time depending on the time source of the cache. +func (c *ClientCertExpirationCache) timeNow() int64 { + if timeSrc, ok := c.timeSrc.(timeutil.TimeSource); ok { + return timeSrc.Now().Unix() + } + return timeutil.Now().Unix() +} + +// startPurgePastExpirations runs an infinite loop in a goroutine which +// regularly evicts entries associated with expiration values that have already +// passed. +func (c *ClientCertExpirationCache) startPurgePastExpirations(ctx context.Context) error { + return c.stopper.RunAsyncTask(ctx, "purge-cert-expiry-cache", func(context.Context) { + const period = time.Hour + + timer := timeutil.NewTimer() + defer timer.Stop() + + timer.Reset(jitteredInterval(period)) + for ; ; timer.Reset(period) { + select { + case <-timer.C: + timer.Read = true + c.PurgePastExpirations() + case <-c.stopper.ShouldQuiesce(): + return + case <-ctx.Done(): + return + } + } + }, + ) +} + +// PurgePastExpirations removes entries associated with expiration values that +// have already passed. This helps ensure that the cache contains gauges +// with expiration values in the future only. +func (c *ClientCertExpirationCache) PurgePastExpirations() { + c.mu.Lock() + defer c.mu.Unlock() + var deleteEntryKeys []interface{} + c.mu.cache.Do(func(entry *cache.Entry) { + gauge := entry.Value.(*aggmetric.Gauge) + if gauge.Value() <= c.timeNow() { + deleteEntryKeys = append(deleteEntryKeys, entry.Key) + } + }) + for _, key := range deleteEntryKeys { + c.mu.cache.Del(key) + } +} + +// jitteredInterval returns a randomly jittered (+/-25%) duration +// from the interval. +func jitteredInterval(interval time.Duration) time.Duration { + return time.Duration(float64(interval) * (0.75 + 0.5*math_rand.Float64())) +} diff --git a/pkg/security/cert_expiry_cache_test.go b/pkg/security/cert_expiry_cache_test.go new file mode 100644 index 000000000000..c899a6a328ab --- /dev/null +++ b/pkg/security/cert_expiry_cache_test.go @@ -0,0 +1,232 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package security_test + +import ( + "context" + "math" + "sync" + "testing" + + "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/metric" + "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" + "github.com/cockroachdb/cockroach/pkg/util/mon" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/stretchr/testify/require" +) + +func TestEntryCache(t *testing.T) { + defer leaktest.AfterTest(t)() + + const ( + fooUser = "foo" + barUser = "bar" + blahUser = "blah" + fakeUser = "fake" + + laterExpiration = int64(1684359292) + closerExpiration = int64(1584359292) + ) + + ctx := context.Background() + + // Create a cache with a capacity of 3. + cache, metric := newCache( + ctx, + &cluster.Settings{}, + 3, /* capacity */ + timeutil.NewManualTime(timeutil.Unix(0, 123)), + ) + require.Equal(t, 0, cache.Len()) + + // Verify insert. + cache.MaybeUpsert(ctx, fooUser, laterExpiration, metric) + require.Equal(t, 1, cache.Len()) + + // Verify update. + cache.MaybeUpsert(ctx, fooUser, closerExpiration, metric) + require.Equal(t, 1, cache.Len()) + + // Verify retrieval. + expiration, found := cache.Get(fooUser) + require.Equal(t, true, found) + require.Equal(t, closerExpiration, expiration) + + // Verify the cache retains the minimum expiration for a user, assuming no + // eviction. + cache.MaybeUpsert(ctx, barUser, closerExpiration, metric) + require.Equal(t, 2, cache.Len()) + cache.MaybeUpsert(ctx, barUser, laterExpiration, metric) + require.Equal(t, 2, cache.Len()) + expiration, found = cache.Get(barUser) + require.Equal(t, true, found) + require.Equal(t, closerExpiration, expiration) + + // Verify indication of absence for non-existent values. + expiration, found = cache.Get(fakeUser) + require.Equal(t, false, found) + require.Equal(t, int64(0), expiration) + + // Verify eviction when the capacity is exceeded. + cache.MaybeUpsert(ctx, blahUser, laterExpiration, metric) + require.Equal(t, 3, cache.Len()) + cache.MaybeUpsert(ctx, fakeUser, closerExpiration, metric) + require.Equal(t, 3, cache.Len()) + _, found = cache.Get(fooUser) + require.Equal(t, false, found) + _, found = cache.Get(barUser) + require.Equal(t, true, found) + + // Verify previous entries can be inserted after the cache is cleared. + cache.Clear() + require.Equal(t, 0, cache.Len()) + _, found = cache.Get(fooUser) + require.Equal(t, false, found) + _, found = cache.Get(barUser) + require.Equal(t, false, found) + cache.MaybeUpsert(ctx, fooUser, laterExpiration, metric) + require.Equal(t, 1, cache.Len()) + cache.MaybeUpsert(ctx, barUser, laterExpiration, metric) + require.Equal(t, 2, cache.Len()) + expiration, found = cache.Get(fooUser) + require.Equal(t, true, found) + require.Equal(t, laterExpiration, expiration) + expiration, found = cache.Get(barUser) + require.Equal(t, true, found) + require.Equal(t, laterExpiration, expiration) + + // Verify expirations in the past cannot be inserted into the cache. + cache.Clear() + cache.MaybeUpsert(ctx, fooUser, int64(0), metric) + require.Equal(t, 0, cache.Len()) +} + +func TestPurgePastEntries(t *testing.T) { + defer leaktest.AfterTest(t)() + + const ( + fooUser = "foo" + barUser = "bar" + blahUser = "blah" + bazUser = "baz" + + pastExpiration1 = int64(1000000000) + pastExpiration2 = int64(2000000000) + futureExpiration = int64(3000000000) + ) + + ctx := context.Background() + + // Create a cache with a capacity of 3. + clock := timeutil.NewManualTime(timeutil.Unix(0, 123)) + cache, metric := newCache(ctx, &cluster.Settings{}, 4 /* capacity */, clock) + + // Insert entries that we expect to be cleaned up after advancing in time. + cache.MaybeUpsert(ctx, fooUser, pastExpiration1, metric) + cache.MaybeUpsert(ctx, barUser, pastExpiration2, metric) + cache.MaybeUpsert(ctx, blahUser, pastExpiration2, metric) + // Insert an entry that should NOT be removed after advancing in time + // because it is still in the future. + cache.MaybeUpsert(ctx, bazUser, futureExpiration, metric) + require.Equal(t, 4, cache.Len()) + + // Advance time so that expirations have been reached already. + clock.AdvanceTo(timeutil.Unix(2000000000, 123)) + + // Verify an expiration from the past cannot be retrieved. Confirm it has + // been removed after the attempt as well. + _, found := cache.Get(fooUser) + require.Equal(t, false, found) + require.Equal(t, 3, cache.Len()) + + // Verify that when the cache gets cleaned of the past expirations. + // Confirm that expirations in the future do not get removed. + cache.PurgePastExpirations() + require.Equal(t, 1, cache.Len()) + _, found = cache.Get(bazUser) + require.Equal(t, true, found) +} + +// TestConcurrentUpdates ensures that concurrent updates do not race with each +// other. +func TestConcurrentUpdates(t *testing.T) { + defer leaktest.AfterTest(t)() + + ctx := context.Background() + st := &cluster.Settings{} + + // Create a cache with a large capacity. + cache, metric := newCache( + ctx, + st, + 10000, /* capacity */ + timeutil.NewManualTime(timeutil.Unix(0, 123)), + ) + + var ( + user = "testUser" + expiration = int64(1684359292) + ) + + // NB: N is chosen based on the race detector's limit of 8128 goroutines. + const N = 8000 + var wg sync.WaitGroup + wg.Add(N) + for i := 0; i < N; i++ { + go func(i int) { + if i%2 == 1 { + cache.MaybeUpsert(ctx, user, expiration, metric) + } else { + cache.Clear() + } + wg.Done() + }(i) + } + wg.Wait() + cache.Clear() +} + +func BenchmarkCertExpirationCacheInsert(b *testing.B) { + ctx := context.Background() + st := &cluster.Settings{} + clock := timeutil.NewManualTime(timeutil.Unix(0, 123)) + cache, metric := newCache(ctx, st, 1000 /* capacity */, clock) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + cache.MaybeUpsert(ctx, "foo", clock.Now().Unix(), metric) + cache.MaybeUpsert(ctx, "bar", clock.Now().Unix(), metric) + cache.MaybeUpsert(ctx, "blah", clock.Now().Unix(), metric) + } +} + +func newCache( + ctx context.Context, st *cluster.Settings, capacity int, clock *timeutil.ManualTime, +) (*security.ClientCertExpirationCache, *aggmetric.AggGauge) { + stopper := stop.NewStopper() + defer stopper.Stop(ctx) + security.ClientCertExpirationCacheCapacity.Override(ctx, &st.SV, int64(capacity)) + parentMon := mon.NewUnlimitedMonitor( + ctx, + "test", /* name */ + mon.MemoryResource, + nil, /* currCount */ + nil, /* maxHist */ + math.MaxInt64, + st, + ) + cache := security.NewClientCertExpirationCache(ctx, st, stopper, clock, parentMon) + return cache, aggmetric.MakeBuilder(security.SQLUserLabel).Gauge(metric.Metadata{}) +} diff --git a/pkg/security/certificate_manager.go b/pkg/security/certificate_manager.go index d63cbb9a06de..f6ef0aa3d970 100644 --- a/pkg/security/certificate_manager.go +++ b/pkg/security/certificate_manager.go @@ -26,58 +26,6 @@ import ( "github.com/cockroachdb/errors" ) -var ( - metaCAExpiration = metric.Metadata{ - Name: "security.certificate.expiration.ca", - Help: "Expiration for the CA certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaClientCAExpiration = metric.Metadata{ - Name: "security.certificate.expiration.client-ca", - Help: "Expiration for the client CA certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaUICAExpiration = metric.Metadata{ - Name: "security.certificate.expiration.ui-ca", - Help: "Expiration for the UI CA certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaNodeExpiration = metric.Metadata{ - Name: "security.certificate.expiration.node", - Help: "Expiration for the node certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaNodeClientExpiration = metric.Metadata{ - Name: "security.certificate.expiration.node-client", - Help: "Expiration for the node's client certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaUIExpiration = metric.Metadata{ - Name: "security.certificate.expiration.ui", - Help: "Expiration for the UI certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - - metaTenantCAExpiration = metric.Metadata{ - Name: "security.certificate.expiration.ca-client-tenant", - Help: "Expiration for the Tenant Client CA certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } - metaTenantExpiration = metric.Metadata{ - Name: "security.certificate.expiration.client-tenant", - Help: "Expiration for the Tenant Client certificate. 0 means no certificate or error.", - Measurement: "Certificate Expiration", - Unit: metric.Unit_TIMESTAMP_SEC, - } -) - // CertificateManager lives for the duration of the process and manages certificates and keys. // It reloads all certificates when triggered and construct tls.Config objects for // servers or clients. @@ -106,7 +54,7 @@ type CertificateManager struct { // The metrics struct is initialized at init time and metrics do their // own locking. - certMetrics CertificateMetrics + certMetrics Metrics // mu protects all remaining fields. mu syncutil.RWMutex @@ -138,20 +86,6 @@ type CertificateManager struct { tenantConfig *tls.Config } -// CertificateMetrics holds metrics about the various certificates. -// These are initialized when the certificate manager is created and updated -// on reload. -type CertificateMetrics struct { - CAExpiration *metric.Gauge - ClientCAExpiration *metric.Gauge - UICAExpiration *metric.Gauge - NodeExpiration *metric.Gauge - NodeClientExpiration *metric.Gauge - UIExpiration *metric.Gauge - TenantCAExpiration *metric.Gauge - TenantExpiration *metric.Gauge -} - func makeCertificateManager( certsDir string, tlsSettings TLSSettings, opts ...Option, ) *CertificateManager { @@ -164,16 +98,7 @@ func makeCertificateManager( Locator: certnames.MakeLocator(certsDir), tenantIdentifier: o.tenantIdentifier, tlsSettings: tlsSettings, - certMetrics: CertificateMetrics{ - CAExpiration: metric.NewGauge(metaCAExpiration), - ClientCAExpiration: metric.NewGauge(metaClientCAExpiration), - UICAExpiration: metric.NewGauge(metaUICAExpiration), - NodeExpiration: metric.NewGauge(metaNodeExpiration), - NodeClientExpiration: metric.NewGauge(metaNodeClientExpiration), - UIExpiration: metric.NewGauge(metaUIExpiration), - TenantCAExpiration: metric.NewGauge(metaTenantCAExpiration), - TenantExpiration: metric.NewGauge(metaTenantExpiration), - }, + certMetrics: makeMetrics(), } } @@ -225,7 +150,7 @@ func (cm *CertificateManager) IsForTenant() bool { } // Metrics returns the metrics struct. -func (cm *CertificateManager) Metrics() CertificateMetrics { +func (cm *CertificateManager) Metrics() Metrics { return cm.certMetrics } diff --git a/pkg/security/certificate_metrics.go b/pkg/security/certificate_metrics.go new file mode 100644 index 000000000000..7388b810c21c --- /dev/null +++ b/pkg/security/certificate_metrics.go @@ -0,0 +1,119 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package security + +import ( + "github.com/cockroachdb/cockroach/pkg/util/metric" + "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" +) + +const SQLUserLabel = "sql_user" + +// Metrics is a metric.Struct for certificates. +type Metrics struct { + // CA certificate expirations. + CAExpiration *metric.Gauge + ClientCAExpiration *metric.Gauge + UICAExpiration *metric.Gauge + TenantCAExpiration *metric.Gauge + + // Certificate expirations. + NodeExpiration *metric.Gauge + NodeClientExpiration *metric.Gauge + UIExpiration *metric.Gauge + TenantExpiration *metric.Gauge + // ClientExpiration is an aggregate metric, containing child metrics for all + // users that have done cert auth with this node. + // The children are labeled by SQL user. + // The top-level aggregated value for this metric is not meaningful + // (it sums up all the minimum expirations of all users). + ClientExpiration *aggmetric.AggGauge +} + +var _ metric.Struct = (*Metrics)(nil) + +// MetricStruct indicates that Metrics is a metric.Struct. +func (m *Metrics) MetricStruct() {} + +var ( + metaCAExpiration = metric.Metadata{ + Name: "security.certificate.expiration.ca", + Help: "Expiration for the CA certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaClientCAExpiration = metric.Metadata{ + Name: "security.certificate.expiration.client-ca", + Help: "Expiration for the client CA certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaClientExpiration = metric.Metadata{ + Name: "security.certificate.expiration.client", + Help: "Minimum expiration for client certificates, labeled by SQL user. 0 means no " + + "certificate or error. ", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaUICAExpiration = metric.Metadata{ + Name: "security.certificate.expiration.ui-ca", + Help: "Expiration for the UI CA certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaNodeExpiration = metric.Metadata{ + Name: "security.certificate.expiration.node", + Help: "Expiration for the node certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaNodeClientExpiration = metric.Metadata{ + Name: "security.certificate.expiration.node-client", + Help: "Expiration for the node's client certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaUIExpiration = metric.Metadata{ + Name: "security.certificate.expiration.ui", + Help: "Expiration for the UI certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + + metaTenantCAExpiration = metric.Metadata{ + Name: "security.certificate.expiration.ca-client-tenant", + Help: "Expiration for the Tenant Client CA certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } + metaTenantExpiration = metric.Metadata{ + Name: "security.certificate.expiration.client-tenant", + Help: "Expiration for the Tenant Client certificate. 0 means no certificate or error.", + Measurement: "Certificate Expiration", + Unit: metric.Unit_TIMESTAMP_SEC, + } +) + +func makeMetrics() Metrics { + b := aggmetric.MakeBuilder(SQLUserLabel) + m := Metrics{ + CAExpiration: metric.NewGauge(metaCAExpiration), + ClientCAExpiration: metric.NewGauge(metaClientCAExpiration), + TenantCAExpiration: metric.NewGauge(metaTenantCAExpiration), + UICAExpiration: metric.NewGauge(metaUICAExpiration), + ClientExpiration: b.Gauge(metaClientExpiration), + TenantExpiration: metric.NewGauge(metaTenantExpiration), + NodeExpiration: metric.NewGauge(metaNodeExpiration), + NodeClientExpiration: metric.NewGauge(metaNodeClientExpiration), + UIExpiration: metric.NewGauge(metaUIExpiration), + } + return m +} diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 54aa2f81576f..de5599c26577 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -48,6 +48,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/rpc" "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" "github.com/cockroachdb/cockroach/pkg/scheduledjobs" + "github.com/cockroachdb/cockroach/pkg/security" "github.com/cockroachdb/cockroach/pkg/security/clientsecopts" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/server/autoconfig" @@ -945,35 +946,42 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { storageEngineClient := kvserver.NewStorageEngineClient(cfg.nodeDialer) *execCfg = sql.ExecutorConfig{ - Settings: cfg.Settings, - NodeInfo: nodeInfo, - Codec: codec, - DefaultZoneConfig: &cfg.DefaultZoneConfig, - Locality: cfg.Locality, - AmbientCtx: cfg.AmbientCtx, - DB: cfg.db, - Gossip: cfg.gossip, - NodeLiveness: cfg.nodeLiveness, - SystemConfig: cfg.systemConfigWatcher, - MetricsRecorder: cfg.recorder, - DistSender: cfg.distSender, - RPCContext: cfg.rpcContext, - LeaseManager: leaseMgr, - TenantStatusServer: cfg.tenantStatusServer, - Clock: cfg.clock, - DistSQLSrv: distSQLServer, - NodesStatusServer: cfg.nodesStatusServer, - SQLStatusServer: cfg.sqlStatusServer, - SessionRegistry: cfg.sessionRegistry, - ClosedSessionCache: cfg.closedSessionCache, - ContentionRegistry: contentionRegistry, - SQLLiveness: cfg.sqlLivenessProvider, - JobRegistry: jobRegistry, - VirtualSchemas: virtualSchemas, - HistogramWindowInterval: cfg.HistogramWindowInterval(), - RangeDescriptorCache: cfg.distSender.RangeDescriptorCache(), - RoleMemberCache: sql.NewMembershipCache(serverCacheMemoryMonitor.MakeBoundAccount(), cfg.stopper), - SessionInitCache: sessioninit.NewCache(serverCacheMemoryMonitor.MakeBoundAccount(), cfg.stopper), + Settings: cfg.Settings, + NodeInfo: nodeInfo, + Codec: codec, + DefaultZoneConfig: &cfg.DefaultZoneConfig, + Locality: cfg.Locality, + AmbientCtx: cfg.AmbientCtx, + DB: cfg.db, + Gossip: cfg.gossip, + NodeLiveness: cfg.nodeLiveness, + SystemConfig: cfg.systemConfigWatcher, + MetricsRecorder: cfg.recorder, + DistSender: cfg.distSender, + RPCContext: cfg.rpcContext, + LeaseManager: leaseMgr, + TenantStatusServer: cfg.tenantStatusServer, + Clock: cfg.clock, + DistSQLSrv: distSQLServer, + NodesStatusServer: cfg.nodesStatusServer, + SQLStatusServer: cfg.sqlStatusServer, + SessionRegistry: cfg.sessionRegistry, + ClosedSessionCache: cfg.closedSessionCache, + ContentionRegistry: contentionRegistry, + SQLLiveness: cfg.sqlLivenessProvider, + JobRegistry: jobRegistry, + VirtualSchemas: virtualSchemas, + HistogramWindowInterval: cfg.HistogramWindowInterval(), + RangeDescriptorCache: cfg.distSender.RangeDescriptorCache(), + RoleMemberCache: sql.NewMembershipCache( + serverCacheMemoryMonitor.MakeBoundAccount(), cfg.stopper, + ), + SessionInitCache: sessioninit.NewCache( + serverCacheMemoryMonitor.MakeBoundAccount(), cfg.stopper, + ), + ClientCertExpirationCache: security.NewClientCertExpirationCache( + ctx, cfg.Settings, cfg.stopper, &timeutil.DefaultTimeSource{}, rootSQLMemoryMonitor, + ), RootMemoryMonitor: rootSQLMemoryMonitor, TestingKnobs: sqlExecutorTestingKnobs, CompactEngineSpanFunc: storageEngineClient.CompactEngineSpan, diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index c1dcbbd3e084..6933edf10380 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -49,6 +49,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/obs" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc" + "github.com/cockroachdb/cockroach/pkg/security" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/server/autoconfig/acprovider" "github.com/cockroachdb/cockroach/pkg/server/pgurl" @@ -1309,6 +1310,9 @@ type ExecutorConfig struct { // Role membership cache. RoleMemberCache *MembershipCache + // Client cert expiration cache. + ClientCertExpirationCache *security.ClientCertExpirationCache + // SessionInitCache cache; contains information used during authentication // and per-role default settings. SessionInitCache *sessioninit.Cache diff --git a/pkg/sql/pgwire/auth_methods.go b/pkg/sql/pgwire/auth_methods.go index 4065737bd05d..b18f2e25aeb1 100644 --- a/pkg/sql/pgwire/auth_methods.go +++ b/pkg/sql/pgwire/auth_methods.go @@ -427,7 +427,19 @@ func authCert( tlsState.PeerCertificates[0].Subject.CommonName = tree.Name( tlsState.PeerCertificates[0].Subject.CommonName, ).Normalize() - hook, err := security.UserAuthCertHook(false /*insecure*/, &tlsState, execCfg.RPCContext.TenantID) + + cm, err := execCfg.RPCContext.SecurityContext.GetCertificateManager() + if err != nil { + log.Ops.Warningf(ctx, "failed to get cert manager info: %v", err) + } + + hook, err := security.UserAuthCertHook( + false, /*insecure*/ + &tlsState, + execCfg.RPCContext.TenantID, + cm, + execCfg.ClientCertExpirationCache, + ) if err != nil { return err }