diff --git a/domain/domain.go b/domain/domain.go index 58011b91efb8f..cb03690cd11b8 100644 --- a/domain/domain.go +++ b/domain/domain.go @@ -164,6 +164,13 @@ func (do *Domain) EtcdClient() *clientv3.Client { return do.etcdClient } +var ( + loadSchemaCounterSnapshot = metrics.LoadSchemaCounter.WithLabelValues("snapshot") + loadSchemaDurationTotal = metrics.LoadSchemaDuration.WithLabelValues("total") + loadSchemaDurationLoadDiff = metrics.LoadSchemaDuration.WithLabelValues("load-diff") + loadSchemaDurationLoadAll = metrics.LoadSchemaDuration.WithLabelValues("load-all") +) + // loadInfoSchema loads infoschema at startTS. // It returns: // 1. the needed infoschema @@ -172,6 +179,10 @@ func (do *Domain) EtcdClient() *clientv3.Client { // 4. the changed table IDs if it is not full load // 5. an error if any func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, int64, *transaction.RelatedSchemaChange, error) { + beginTime := time.Now() + defer func() { + loadSchemaDurationTotal.Observe(time.Since(beginTime).Seconds()) + }() snapshot := do.store.GetSnapshot(kv.NewVersion(startTS)) m := meta.NewSnapshotMeta(snapshot) neededSchemaVersion, err := m.GetSchemaVersionWithNonEmptyDiff() @@ -207,6 +218,7 @@ func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, i if currentSchemaVersion != 0 && neededSchemaVersion > currentSchemaVersion && neededSchemaVersion-currentSchemaVersion < 100 { is, relatedChanges, err := do.tryLoadSchemaDiffs(m, currentSchemaVersion, neededSchemaVersion) if err == nil { + loadSchemaDurationLoadDiff.Observe(time.Since(startTime).Seconds()) do.infoCache.Insert(is, uint64(schemaTs)) logutil.BgLogger().Info("diff load InfoSchema success", zap.Int64("currentSchemaVersion", currentSchemaVersion), @@ -234,6 +246,7 @@ func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, i if err != nil { return nil, false, currentSchemaVersion, nil, err } + 
loadSchemaDurationLoadAll.Observe(time.Since(startTime).Seconds()) logutil.BgLogger().Info("full load InfoSchema success", zap.Int64("currentSchemaVersion", currentSchemaVersion), zap.Int64("neededSchemaVersion", neededSchemaVersion), @@ -415,6 +428,7 @@ func (do *Domain) GetSnapshotInfoSchema(snapshotTS uint64) (infoschema.InfoSchem return is, nil } is, _, _, _, err := do.loadInfoSchema(snapshotTS) + loadSchemaCounterSnapshot.Inc() return is, err } @@ -496,7 +510,6 @@ func (do *Domain) Reload() error { } is, hitCache, oldSchemaVersion, changes, err := do.loadInfoSchema(ver.Ver) - metrics.LoadSchemaDuration.Observe(time.Since(startTime).Seconds()) if err != nil { metrics.LoadSchemaCounter.WithLabelValues("failed").Inc() return err @@ -926,7 +939,7 @@ func NewDomain(store kv.Storage, ddlLease time.Duration, statsLease time.Duratio exit: make(chan struct{}), sysSessionPool: newSessionPool(capacity, factory), statsLease: statsLease, - infoCache: infoschema.NewCache(1024), + infoCache: infoschema.NewCache(int(variable.SchemaVersionCacheLimit.Load())), slowQuery: newTopNSlowQueries(30, time.Hour*24*7, 500), indexUsageSyncLease: idxUsageSyncLease, dumpFileGcChecker: &dumpFileGcChecker{gcLease: dumpFileGcLease, paths: []string{GetPlanReplayerDirName(), GetOptimizerTraceDirName()}}, diff --git a/domain/sysvar_cache.go b/domain/sysvar_cache.go index 370260a67c02a..9c50c779fb461 100644 --- a/domain/sysvar_cache.go +++ b/domain/sysvar_cache.go @@ -158,5 +158,6 @@ func (do *Domain) rebuildSysVarCache(ctx sessionctx.Context) error { defer do.sysVarCache.Unlock() do.sysVarCache.session = newSessionCache do.sysVarCache.global = newGlobalCache + do.infoCache.ReSize(int(variable.SchemaVersionCacheLimit.Load())) return nil } diff --git a/executor/set_test.go b/executor/set_test.go index 734fdab8750fe..850042f6dc431 100644 --- a/executor/set_test.go +++ b/executor/set_test.go @@ -870,6 +870,23 @@ func TestSetVar(t *testing.T) { require.Equal(t, uint64(2), 
tk.Session().GetSessionVars().CDCWriteSource) tk.MustExec("set @@session.tidb_cdc_write_source = 0") require.Equal(t, uint64(0), tk.Session().GetSessionVars().CDCWriteSource) + + // test tidb_schema_version_cache_limit + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("64")) + tk.MustExec("set @@global.tidb_schema_version_cache_limit=64;") + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("64")) + tk.MustExec("set @@global.tidb_schema_version_cache_limit=2;") + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("2")) + tk.MustExec("set @@global.tidb_schema_version_cache_limit=256;") + tk.MustQuery("SHOW WARNINGS").Check(testkit.Rows("Warning 1292 Truncated incorrect tidb_schema_version_cache_limit value: '256'")) + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("255")) + tk.MustExec("set @@global.tidb_schema_version_cache_limit=0;") + tk.MustQuery("SHOW WARNINGS").Check(testkit.Rows("Warning 1292 Truncated incorrect tidb_schema_version_cache_limit value: '0'")) + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("2")) + tk.MustGetErrMsg("set @@global.tidb_schema_version_cache_limit='x';", "[variable:1232]Incorrect argument type to variable 'tidb_schema_version_cache_limit'") + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("2")) + tk.MustExec("set @@global.tidb_schema_version_cache_limit=64;") + tk.MustQuery("select @@global.tidb_schema_version_cache_limit").Check(testkit.Rows("64")) } func TestGetSetNoopVars(t *testing.T) { diff --git a/infoschema/cache.go b/infoschema/cache.go index eb9fbc6c4857b..8446f240431d7 100644 --- a/infoschema/cache.go +++ b/infoschema/cache.go @@ -54,6 +54,30 @@ func NewCache(capacity int) *InfoCache { } } +// ReSize changes the capacity of the cache, keeping at most the first capacity existing entries.
+func (h *InfoCache) ReSize(capacity int) { + h.mu.Lock() + defer h.mu.Unlock() + if cap(h.cache) == capacity { + return + } + oldCache := h.cache + h.cache = make([]schemaAndTimestamp, 0, capacity) + for i, v := range oldCache { + if i >= capacity { + break + } + h.cache = append(h.cache, v) + } +} + +// Size returns the number of entries currently in the cache; exported for tests. +func (h *InfoCache) Size() int { + h.mu.Lock() + defer h.mu.Unlock() + return len(h.cache) +} + // Reset resets the cache. func (h *InfoCache) Reset(capacity int) { h.mu.Lock() diff --git a/infoschema/cache_test.go b/infoschema/cache_test.go index d55a7a2281cbc..5d6f6584d9c9f 100644 --- a/infoschema/cache_test.go +++ b/infoschema/cache_test.go @@ -178,3 +178,37 @@ func TestGetByTimestamp(t *testing.T) { require.Equal(t, 3, ic.Len()) } + +func TestReSize(t *testing.T) { + ic := infoschema.NewCache(2) + require.NotNil(t, ic) + is1 := infoschema.MockInfoSchemaWithSchemaVer(nil, 1) + ic.Insert(is1, 1) + is2 := infoschema.MockInfoSchemaWithSchemaVer(nil, 2) + ic.Insert(is2, 2) + + ic.ReSize(3) + require.Equal(t, 2, ic.Size()) + require.Equal(t, is1, ic.GetByVersion(1)) + require.Equal(t, is2, ic.GetByVersion(2)) + is3 := infoschema.MockInfoSchemaWithSchemaVer(nil, 3) + require.True(t, ic.Insert(is3, 3)) + require.Equal(t, is1, ic.GetByVersion(1)) + require.Equal(t, is2, ic.GetByVersion(2)) + require.Equal(t, is3, ic.GetByVersion(3)) + + ic.ReSize(1) + require.Equal(t, 1, ic.Size()) + require.Nil(t, ic.GetByVersion(1)) + require.Nil(t, ic.GetByVersion(2)) + require.Equal(t, is3, ic.GetByVersion(3)) + require.False(t, ic.Insert(is2, 2)) + require.Equal(t, 1, ic.Size()) + is4 := infoschema.MockInfoSchemaWithSchemaVer(nil, 4) + require.True(t, ic.Insert(is4, 4)) + require.Equal(t, 1, ic.Size()) + require.Nil(t, ic.GetByVersion(1)) + require.Nil(t, ic.GetByVersion(2)) + require.Nil(t, ic.GetByVersion(3)) + require.Equal(t, is4, ic.GetByVersion(4)) +} diff --git a/metrics/domain.go b/metrics/domain.go index
ab6d9df5ae0d8..a76062a811991 100644 --- a/metrics/domain.go +++ b/metrics/domain.go @@ -30,14 +30,14 @@ var ( }, []string{LblType}) // LoadSchemaDuration records the duration of load schema. - LoadSchemaDuration = prometheus.NewHistogram( + LoadSchemaDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "tidb", Subsystem: "domain", Name: "load_schema_duration_seconds", Help: "Bucketed histogram of processing time (s) in load schema.", Buckets: prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s - }) + }, []string{LblAction}) // InfoCacheCounters are the counters of get/hit. InfoCacheCounters = prometheus.NewCounterVec( diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 9e2f362c74c84..6ee35f83bbb85 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -11670,13 +11670,15 @@ "legend": { "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -11696,10 +11698,10 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, action))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{action}}", "metric": "", "refId": "A", "step": 10 @@ -11709,7 +11711,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Load Schema Duration", + "title": "Load Schema Action Duration", "tooltip": { "msResolution": false, "shared": true, @@ -11881,13 
+11883,15 @@ "legend": { "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -11944,7 +11948,7 @@ { "format": "short", "label": null, - "logBase": 10, + "logBase": 1, "max": null, "min": null, "show": true @@ -12075,6 +12079,114 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiDB schema cache operations per second.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 314, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tidb_domain_infocache_counters{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (action,type)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{action}}-{{type}}", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Schema Cache OPS", + "tooltip": { + "msResolution": 
false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, diff --git a/sessionctx/variable/sysvar.go b/sessionctx/variable/sysvar.go index b45c13e90ef45..f0d3d56eab09b 100644 --- a/sessionctx/variable/sysvar.go +++ b/sessionctx/variable/sysvar.go @@ -2297,6 +2297,11 @@ var defaultSysVars = []*SysVar{ return nil }, }, + {Scope: ScopeGlobal, Name: TiDBSchemaVersionCacheLimit, Value: strconv.Itoa(DefTiDBSchemaVersionCacheLimit), Type: TypeInt, MinValue: 2, MaxValue: math.MaxUint8, AllowEmpty: true, + SetGlobal: func(_ context.Context, s *SessionVars, val string) error { + SchemaVersionCacheLimit.Store(TidbOptInt64(val, DefTiDBSchemaVersionCacheLimit)) + return nil + }}, } // FeedbackProbability points to the FeedbackProbability in statistics package. diff --git a/sessionctx/variable/tidb_vars.go b/sessionctx/variable/tidb_vars.go index 086f32a685839..9daa390456a03 100644 --- a/sessionctx/variable/tidb_vars.go +++ b/sessionctx/variable/tidb_vars.go @@ -907,6 +907,8 @@ const ( PasswordReuseHistory = "password_history" // PasswordReuseTime limit how long passwords can be reused. PasswordReuseTime = "password_reuse_interval" + // TiDBSchemaVersionCacheLimit defines the capacity of the domain infoSchema cache. + TiDBSchemaVersionCacheLimit = "tidb_schema_version_cache_limit" ) // TiDB intentional limits @@ -1167,6 +1169,7 @@ const ( DefTiDBTTLJobScheduleWindowEndTime = "23:59 +0000" DefTiDBTTLScanWorkerCount = 4 DefTiDBTTLDeleteWorkerCount = 4 + DefTiDBSchemaVersionCacheLimit = 64 ) // Process global variables.
@@ -1242,6 +1245,7 @@ var ( PasswordReuseInterval = atomic.NewInt64(DefPasswordReuseTime) IsSandBoxModeEnabled = atomic.NewBool(false) MaxPreparedStmtCountValue = atomic.NewInt64(DefMaxPreparedStmtCount) + SchemaVersionCacheLimit = atomic.NewInt64(DefTiDBSchemaVersionCacheLimit) ) var (