From 99c19071bfc7038f9f4a56335dc0c983c5582e05 Mon Sep 17 00:00:00 2001 From: Rafer Hazen Date: Wed, 12 Jul 2023 16:32:33 -0600 Subject: [PATCH 1/2] Add QueryTimingsByTabletType Added instrumentation alongside the regular QueryTimings, except for in the 2PC related callsites which lack access to TabletType. Since we're not using 2PC for now this is ok, but will need to add them if we do endn up using 2PC for real things. Signed-off-by: Rafer Hazen --- go/vt/vttablet/tabletserver/query_executor.go | 3 +++ go/vt/vttablet/tabletserver/tabletenv/stats.go | 4 ++++ go/vt/vttablet/tabletserver/tabletserver.go | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/go/vt/vttablet/tabletserver/query_executor.go b/go/vt/vttablet/tabletserver/query_executor.go index 8dd35d32db7..edd786492e9 100644 --- a/go/vt/vttablet/tabletserver/query_executor.go +++ b/go/vt/vttablet/tabletserver/query_executor.go @@ -124,6 +124,7 @@ func (qre *QueryExecutor) Execute() (reply *sqltypes.Result, err error) { defer func(start time.Time) { duration := time.Since(start) qre.tsv.stats.QueryTimings.Add(planName, duration) + qre.tsv.stats.QueryTimingsByTabletType.Add(qre.tabletType.String(), duration) qre.recordUserQuery("Execute", int64(duration)) mysqlTime := qre.logStats.MysqlResponseTime @@ -313,6 +314,7 @@ func (qre *QueryExecutor) Stream(callback StreamCallback) error { defer func(start time.Time) { qre.tsv.stats.QueryTimings.Record(qre.plan.PlanID.String(), start) + qre.tsv.stats.QueryTimingsByTabletType.Record(qre.tabletType.String(), start) qre.recordUserQuery("Stream", int64(time.Since(start))) }(time.Now()) @@ -402,6 +404,7 @@ func (qre *QueryExecutor) MessageStream(callback StreamCallback) error { defer func(start time.Time) { qre.tsv.stats.QueryTimings.Record(qre.plan.PlanID.String(), start) + qre.tsv.stats.QueryTimingsByTabletType.Record(qre.tabletType.String(), start) qre.recordUserQuery("MessageStream", int64(time.Since(start))) }(time.Now()) diff --git a/go/vt/vttablet/tabletserver/tabletenv/stats.go b/go/vt/vttablet/tabletserver/tabletenv/stats.go index 30510f9c32e..17b7101402d 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/stats.go +++ b/go/vt/vttablet/tabletserver/tabletenv/stats.go @@ -47,6 +47,8 @@ type Stats struct { UserActiveReservedCount *stats.CountersWithSingleLabel // Per CallerID active reserved connection counts UserReservedCount *stats.CountersWithSingleLabel // Per CallerID reserved connection counts UserReservedTimesNs *stats.CountersWithSingleLabel // Per CallerID reserved connection duration + + QueryTimingsByTabletType *servenv.TimingsWrapper // Query timings split by current tablet type } // NewStats instantiates a new set of stats scoped by exporter. @@ -94,6 +96,8 @@ func NewStats(exporter *servenv.Exporter) *Stats { UserActiveReservedCount: exporter.NewCountersWithSingleLabel("UserActiveReservedCount", "active reserved connection for each CallerID", "CallerID"), UserReservedCount: exporter.NewCountersWithSingleLabel("UserReservedCount", "reserved connection received for each CallerID", "CallerID"), UserReservedTimesNs: exporter.NewCountersWithSingleLabel("UserReservedTimesNs", "Total reserved connection latency for each CallerID", "CallerID"), + + QueryTimingsByTabletType: exporter.NewTimings("QueryTimingsByTabletType", "Query timings broken down by active tablet type", "TabletType"), } stats.QPSRates = exporter.NewRates("QPS", stats.QueryTimings, 15*60/5, 5*time.Second) return stats diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 09091023168..6216afc9930 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -518,6 +518,7 @@ func (tsv *TabletServer) begin(ctx context.Context, target *querypb.Target, save logStats.OriginalSQL = beginSQL if beginSQL != "" { tsv.stats.QueryTimings.Record("BEGIN", startTime) + tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), startTime) } else { logStats.Method = "" } @@ -573,6 +574,7 @@ func (tsv *TabletServer) Commit(ctx context.Context, target *querypb.Target, tra // handlePanicAndSendLogStats doesn't log the no-op. if commitSQL != "" { tsv.stats.QueryTimings.Record("COMMIT", startTime) + tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), startTime) } else { logStats.Method = "" } @@ -590,6 +592,7 @@ func (tsv *TabletServer) Rollback(ctx context.Context, target *querypb.Target, t target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("ROLLBACK", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) logStats.TransactionID = transactionID newReservedID, err = tsv.te.Rollback(ctx, transactionID) if newReservedID > 0 { @@ -1198,6 +1201,7 @@ func (tsv *TabletServer) ReserveBeginExecute(ctx context.Context, target *queryp target, options, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("RESERVE", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) connID, sessionStateChanges, err = tsv.te.ReserveBegin(ctx, options, preQueries, postBeginQueries) if err != nil { return err @@ -1243,6 +1247,7 @@ func (tsv *TabletServer) ReserveBeginStreamExecute( target, options, false, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("RESERVE", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) connID, sessionStateChanges, err = tsv.te.ReserveBegin(ctx, options, preQueries, postBeginQueries) if err != nil { return err @@ -1296,6 +1301,7 @@ func (tsv *TabletServer) ReserveExecute(ctx context.Context, target *querypb.Tar target, options, allowOnShutdown, func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("RESERVE", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) state.ReservedID, err = tsv.te.Reserve(ctx, options, transactionID, preQueries) if err != nil { return err @@ -1346,6 +1352,7 @@ func (tsv *TabletServer) ReserveStreamExecute( target, options, allowOnShutdown, func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("RESERVE", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) state.ReservedID, err = tsv.te.Reserve(ctx, options, transactionID, preQueries) if err != nil { return err @@ -1375,6 +1382,7 @@ func (tsv *TabletServer) Release(ctx context.Context, target *querypb.Target, tr target, nil, true, /* allowOnShutdown */ func(ctx context.Context, logStats *tabletenv.LogStats) error { defer tsv.stats.QueryTimings.Record("RELEASE", time.Now()) + defer tsv.stats.QueryTimingsByTabletType.Record(target.TabletType.String(), time.Now()) logStats.TransactionID = transactionID logStats.ReservedID = reservedID if reservedID != 0 { From 5475faa7e3e137902f7d6c3935a409a4d257e333 Mon Sep 17 00:00:00 2001 From: Rafer Hazen Date: Thu, 13 Jul 2023 11:18:51 -0600 Subject: [PATCH 2/2] Add QueryCountsWithTabletType metric Otherwise identical to QueryCounts. Added a new metric here instead of adding a tag to the existing metric to avoid breaking anything that may rely on those metrics or /debug/vars. Signed-off-by: Rafer Hazen --- go/vt/vttablet/tabletserver/query_engine.go | 16 ++++--- .../tabletserver/query_engine_test.go | 43 ++++++++++++++++++- go/vt/vttablet/tabletserver/query_executor.go | 5 ++- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go index a59665450a1..a1185d16fee 100644 --- a/go/vt/vttablet/tabletserver/query_engine.go +++ b/go/vt/vttablet/tabletserver/query_engine.go @@ -27,12 +27,9 @@ import ( "sync/atomic" "time" - "vitess.io/vitess/go/mysql/sqlerror" - vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" - "vitess.io/vitess/go/vt/vterrors" - "vitess.io/vitess/go/acl" "vitess.io/vitess/go/cache" + "vitess.io/vitess/go/mysql/sqlerror" "vitess.io/vitess/go/pools" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/streamlog" @@ -41,9 +38,12 @@ import ( "vitess.io/vitess/go/vt/dbconnpool" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/proto/topodata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/tableacl" tacl "vitess.io/vitess/go/vt/tableacl/acl" + "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" "vitess.io/vitess/go/vt/vttablet/tabletserver/planbuilder" "vitess.io/vitess/go/vt/vttablet/tabletserver/rules" @@ -172,7 +172,7 @@ type QueryEngine struct { // stats // Note: queryErrorCountsWithCode is similar to queryErrorCounts except it contains error code as an additional dimension - queryCounts, queryTimes, queryErrorCounts, queryErrorCountsWithCode, queryRowsAffected, queryRowsReturned *stats.CountersWithMultiLabels + queryCounts, queryCountsWithTabletType, queryTimes, queryErrorCounts, queryErrorCountsWithCode, queryRowsAffected, queryRowsReturned *stats.CountersWithMultiLabels // stats flags enablePerWorkloadTableMetrics bool @@ -258,6 +258,7 @@ func NewQueryEngine(env tabletenv.Env, se *schema.Engine) *QueryEngine { } qe.queryCounts = env.Exporter().NewCountersWithMultiLabels("QueryCounts", "query counts", labels) + qe.queryCountsWithTabletType = env.Exporter().NewCountersWithMultiLabels("QueryCountsWithTabletType", "query counts with tablet type labels", []string{"Table", "Plan", "TabletType"}) qe.queryTimes = env.Exporter().NewCountersWithMultiLabels("QueryTimesNs", "query times in ns", labels) qe.queryRowsAffected = env.Exporter().NewCountersWithMultiLabels("QueryRowsAffected", "query rows affected", labels) qe.queryRowsReturned = env.Exporter().NewCountersWithMultiLabels("QueryRowsReturned", "query rows returned", labels) @@ -490,7 +491,7 @@ func (qe *QueryEngine) QueryPlanCacheLen() int { } // AddStats adds the given stats for the planName.tableName -func (qe *QueryEngine) AddStats(planType planbuilder.PlanType, tableName, workload string, queryCount int64, duration, mysqlTime time.Duration, rowsAffected, rowsReturned, errorCount int64, errorCode string) { +func (qe *QueryEngine) AddStats(planType planbuilder.PlanType, tableName, workload string, tabletType topodata.TabletType, queryCount int64, duration, mysqlTime time.Duration, rowsAffected, rowsReturned, errorCount int64, errorCode string) { // table names can contain "." characters, replace them! keys := []string{tableName, planType.String()} // Only use the workload as a label if that's enabled in the configuration. @@ -500,6 +501,9 @@ func (qe *QueryEngine) AddStats(planType planbuilder.PlanType, tableName, worklo qe.queryCounts.Add(keys, queryCount) qe.queryTimes.Add(keys, int64(duration)) qe.queryErrorCounts.Add(keys, errorCount) + + qe.queryCountsWithTabletType.Add([]string{tableName, planType.String(), tabletType.String()}, queryCount) + // queryErrorCountsWithCode is similar to queryErrorCounts except we have an additional dimension // of error code. if errorCount > 0 { diff --git a/go/vt/vttablet/tabletserver/query_engine_test.go b/go/vt/vttablet/tabletserver/query_engine_test.go index 33849c3f5e6..e073b6f8c49 100644 --- a/go/vt/vttablet/tabletserver/query_engine_test.go +++ b/go/vt/vttablet/tabletserver/query_engine_test.go @@ -32,6 +32,8 @@ import ( "testing" "time" + "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/mysql" @@ -577,6 +579,7 @@ func TestAddQueryStats(t *testing.T) { name string planType planbuilder.PlanType tableName string + tabletType topodata.TabletType queryCount int64 duration time.Duration mysqlTime time.Duration @@ -587,6 +590,7 @@ func TestAddQueryStats(t *testing.T) { enablePerWorkloadTableMetrics bool workload string expectedQueryCounts string + expectedQueryCountsWithTableType string expectedQueryTimes string expectedQueryRowsAffected string expectedQueryRowsReturned string @@ -597,6 +601,27 @@ func TestAddQueryStats(t *testing.T) { name: "select query", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, + queryCount: 1, + duration: 10, + rowsAffected: 0, + rowsReturned: 15, + errorCount: 0, + errorCode: "OK", + enablePerWorkloadTableMetrics: false, + workload: "some-workload", + expectedQueryCounts: `{"A.Select": 1}`, + expectedQueryTimes: `{"A.Select": 10}`, + expectedQueryRowsAffected: `{}`, + expectedQueryRowsReturned: `{"A.Select": 15}`, + expectedQueryErrorCounts: `{"A.Select": 0}`, + expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, + }, { + name: "select query against a replica", + planType: planbuilder.PlanSelect, + tableName: "A", + tabletType: topodata.TabletType_REPLICA, queryCount: 1, duration: 10, rowsAffected: 0, @@ -611,10 +636,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select": 15}`, expectedQueryErrorCounts: `{"A.Select": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Select.REPLICA": 1}`, }, { name: "select into query", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 15, @@ -629,10 +656,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select": 0}`, expectedQueryErrorCounts: `{"A.Select": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, }, { name: "error", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 0, @@ -647,10 +676,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select": 0}`, expectedQueryErrorCounts: `{"A.Select": 1}`, expectedQueryErrorCountsWithCode: `{"A.Select.RESOURCE_EXHAUSTED": 1}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, }, { name: "insert query", planType: planbuilder.PlanInsert, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 15, @@ -665,10 +696,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{}`, expectedQueryErrorCounts: `{"A.Insert": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Insert.PRIMARY": 1}`, }, { name: "select query with per workload metrics", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 0, @@ -683,10 +716,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select.some-workload": 15}`, expectedQueryErrorCounts: `{"A.Select.some-workload": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, }, { name: "select into query with per workload metrics", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 15, @@ -701,10 +736,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select.some-workload": 0}`, expectedQueryErrorCounts: `{"A.Select.some-workload": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, }, { name: "error with per workload metrics", planType: planbuilder.PlanSelect, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 0, @@ -719,10 +756,12 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{"A.Select.some-workload": 0}`, expectedQueryErrorCounts: `{"A.Select.some-workload": 1}`, expectedQueryErrorCountsWithCode: `{"A.Select.RESOURCE_EXHAUSTED": 1}`, + expectedQueryCountsWithTableType: `{"A.Select.PRIMARY": 1}`, }, { name: "insert query with per workload metrics", planType: planbuilder.PlanInsert, tableName: "A", + tabletType: topodata.TabletType_PRIMARY, queryCount: 1, duration: 10, rowsAffected: 15, @@ -737,6 +776,7 @@ func TestAddQueryStats(t *testing.T) { expectedQueryRowsReturned: `{}`, expectedQueryErrorCounts: `{"A.Insert.some-workload": 0}`, expectedQueryErrorCountsWithCode: `{}`, + expectedQueryCountsWithTableType: `{"A.Insert.PRIMARY": 1}`, }, } @@ -749,8 +789,9 @@ func TestAddQueryStats(t *testing.T) { env := tabletenv.NewEnv(config, "TestAddQueryStats_"+testcase.name) se := schema.NewEngine(env) qe := NewQueryEngine(env, se) - qe.AddStats(testcase.planType, testcase.tableName, testcase.workload, testcase.queryCount, testcase.duration, testcase.mysqlTime, testcase.rowsAffected, testcase.rowsReturned, testcase.errorCount, testcase.errorCode) + qe.AddStats(testcase.planType, testcase.tableName, testcase.workload, testcase.tabletType, testcase.queryCount, testcase.duration, testcase.mysqlTime, testcase.rowsAffected, testcase.rowsReturned, testcase.errorCount, testcase.errorCode) assert.Equal(t, testcase.expectedQueryCounts, qe.queryCounts.String()) + assert.Equal(t, testcase.expectedQueryCountsWithTableType, qe.queryCountsWithTabletType.String()) assert.Equal(t, testcase.expectedQueryTimes, qe.queryTimes.String()) assert.Equal(t, testcase.expectedQueryRowsAffected, qe.queryRowsAffected.String()) assert.Equal(t, testcase.expectedQueryRowsReturned, qe.queryRowsReturned.String()) diff --git a/go/vt/vttablet/tabletserver/query_executor.go b/go/vt/vttablet/tabletserver/query_executor.go index edd786492e9..c11c811935d 100644 --- a/go/vt/vttablet/tabletserver/query_executor.go +++ b/go/vt/vttablet/tabletserver/query_executor.go @@ -136,13 +136,14 @@ func (qre *QueryExecutor) Execute() (reply *sqltypes.Result, err error) { var errCode string vtErrorCode := vterrors.Code(err) errCode = vtErrorCode.String() + if reply == nil { - qre.tsv.qe.AddStats(qre.plan.PlanID, tableName, qre.options.GetWorkloadName(), 1, duration, mysqlTime, 0, 0, 1, errCode) + qre.tsv.qe.AddStats(qre.plan.PlanID, tableName, qre.options.GetWorkloadName(), qre.tabletType, 1, duration, mysqlTime, 0, 0, 1, errCode) qre.plan.AddStats(1, duration, mysqlTime, 0, 0, 1) return } - qre.tsv.qe.AddStats(qre.plan.PlanID, tableName, qre.options.GetWorkloadName(), 1, duration, mysqlTime, int64(reply.RowsAffected), int64(len(reply.Rows)), 0, errCode) + qre.tsv.qe.AddStats(qre.plan.PlanID, tableName, qre.options.GetWorkloadName(), qre.tabletType, 1, duration, mysqlTime, int64(reply.RowsAffected), int64(len(reply.Rows)), 0, errCode) qre.plan.AddStats(1, duration, mysqlTime, reply.RowsAffected, uint64(len(reply.Rows)), 0) qre.logStats.RowsAffected = int(reply.RowsAffected) qre.logStats.Rows = reply.Rows