From f82ca6a8eba175348ac8b633905b89a567597532 Mon Sep 17 00:00:00 2001 From: Zhou Kunqin <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 7 Dec 2021 19:45:56 +0800 Subject: [PATCH] statistics, util/ranger: add cardinality estimation trace for `GetRowCountBy...` (#30321) --- planner/core/rule_partition_processor.go | 6 +- statistics/table.go | 75 ++++++++++- statistics/testdata/trace_suite_out.json | 134 +++++++++++++++++++- util/ranger/ranger.go | 155 +++++++++++++++++++++++ util/ranger/types.go | 13 +- 5 files changed, 364 insertions(+), 19 deletions(-) diff --git a/planner/core/rule_partition_processor.go b/planner/core/rule_partition_processor.go index bb57b0fac33da..7c3bbb565c69d 100644 --- a/planner/core/rule_partition_processor.go +++ b/planner/core/rule_partition_processor.go @@ -140,7 +140,7 @@ func (s *partitionProcessor) findUsedPartitions(ctx sessionctx.Context, tbl tabl ranges := detachedResult.Ranges used := make([]int, 0, len(ranges)) for _, r := range ranges { - if r.IsPointNullable(ctx) { + if r.IsPointNullable(ctx.GetSessionVars().StmtCtx) { if !r.HighVal[0].IsNull() { if len(r.HighVal) != len(partIdx) { used = []int{-1} @@ -473,7 +473,7 @@ func (l *listPartitionPruner) locateColumnPartitionsByCondition(cond expression. 
return nil, true, nil } var locations []tables.ListPartitionLocation - if r.IsPointNullable(l.ctx) { + if r.IsPointNullable(l.ctx.GetSessionVars().StmtCtx) { location, err := colPrune.LocatePartition(sc, r.HighVal[0]) if types.ErrOverflow.Equal(err) { return nil, true, nil // return full-scan if over-flow @@ -555,7 +555,7 @@ func (l *listPartitionPruner) findUsedListPartitions(conds []expression.Expressi } used := make(map[int]struct{}, len(ranges)) for _, r := range ranges { - if r.IsPointNullable(l.ctx) { + if r.IsPointNullable(l.ctx.GetSessionVars().StmtCtx) { if len(r.HighVal) != len(exprCols) { return l.fullRange, nil } diff --git a/statistics/table.go b/statistics/table.go index e0fabb74f8bd1..73c1d608fc403 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -33,8 +33,11 @@ import ( "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/codec" + "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/ranger" + "github.com/pingcap/tidb/util/tracing" "go.uber.org/atomic" + "go.uber.org/zap" ) const ( @@ -331,17 +334,26 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da // GetRowCountByIntColumnRanges estimates the row count by a slice of IntColumnRange. 
func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, colID int64, intRanges []*ranger.Range) (float64, error) { + var result float64 c, ok := coll.Columns[colID] if !ok || c.IsInvalid(sc, coll.Pseudo) { if len(intRanges) == 0 { return 0, nil } if intRanges[0].LowVal[0].Kind() == types.KindInt64 { - return getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count)), nil + result = getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count)) + } else { + result = getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)) + } + if sc.EnableOptimizerCETrace && ok { + CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats-Pseudo", uint64(result)) } - return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil + return result, nil } result, err := c.GetColumnRowCount(sc, intRanges, coll.Count, true) + if sc.EnableOptimizerCETrace { + CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats", uint64(result)) + } return result, errors.Trace(err) } @@ -349,21 +361,38 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, colID int64, colRanges []*ranger.Range) (float64, error) { c, ok := coll.Columns[colID] if !ok || c.IsInvalid(sc, coll.Pseudo) { - return GetPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0) + result, err := GetPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0) + if err == nil && sc.EnableOptimizerCETrace && ok { + CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats-Pseudo", uint64(result)) + } + return result, err } result, err := c.GetColumnRowCount(sc, colRanges, coll.Count, false) + if sc.EnableOptimizerCETrace { + CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats", uint64(result)) + } return result, errors.Trace(err) } // 
GetRowCountByIndexRanges estimates the row count by a slice of Range. func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) { - idx := coll.Indices[idxID] - if idx == nil || idx.IsInvalid(coll.Pseudo) { + idx, ok := coll.Indices[idxID] + colNames := make([]string, 0, 8) + if ok { + for _, col := range idx.Info.Columns { + colNames = append(colNames, col.Name.O) + } + } + if !ok || idx.IsInvalid(coll.Pseudo) { colsLen := -1 if idx != nil && idx.Info.Unique { colsLen = len(idx.Info.Columns) } - return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), colsLen) + result, err := getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), colsLen) + if err == nil && sc.EnableOptimizerCETrace && ok { + CETraceRange(sc, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result)) + } + return result, err } var result float64 var err error @@ -372,9 +401,43 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx } else { result, err = idx.GetRowCount(sc, coll, indexRanges, coll.Count) } + if sc.EnableOptimizerCETrace { + CETraceRange(sc, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) + } return result, errors.Trace(err) } +// CETraceRange appends a list of ranges and related information into CE trace +func CETraceRange(sc *stmtctx.StatementContext, tableID int64, colNames []string, ranges []*ranger.Range, tp string, rowCount uint64) { + allPoint := true + for _, ran := range ranges { + if !ran.IsPointNullable(sc) { + allPoint = false + break + } + } + if allPoint { + tp = tp + "-Point" + } else { + tp = tp + "-Range" + } + expr, err := ranger.RangesToString(sc, ranges, colNames) + if err != nil { + logutil.BgLogger().Debug("[OptimizerTrace] Failed to trace CE of ranges", zap.Error(err)) + } + // We don't need to record meaningless expressions. 
+ if expr == "" || expr == "true" || expr == "false" { + return + } + CERecord := tracing.CETraceRecord{ + TableID: tableID, + Type: tp, + Expr: expr, + RowCount: rowCount, + } + sc.OptimizerCETrace = append(sc.OptimizerCETrace, &CERecord) +} + // PseudoAvgCountPerValue gets a pseudo average count if histogram not exists. func (t *Table) PseudoAvgCountPerValue() float64 { return float64(t.Count) / pseudoEqualRate diff --git a/statistics/testdata/trace_suite_out.json b/statistics/testdata/trace_suite_out.json index d45173d34d24c..9d13e199263a3 100644 --- a/statistics/testdata/trace_suite_out.json +++ b/statistics/testdata/trace_suite_out.json @@ -5,6 +5,41 @@ { "Expr": "a > 0 and a < 2", "Trace": [ + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Point", + "Expr": "((a = 1))", + "RowCount": 4 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Point", + "Expr": "((a = 1))", + "RowCount": 4 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((a > 0 and a < 2))", + "RowCount": 4 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Point", + "Expr": "((a = 1))", + "RowCount": 4 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Point", + "Expr": "((a = 1))", + "RowCount": 4 + }, { "TableID": 57, "TableName": "", @@ -24,6 +59,27 @@ { "Expr": "a >= 1 and a < 10", "Trace": [ + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Range", + "Expr": "((a >= 1 and a < 10))", + "RowCount": 6 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((a >= 1 and a < 10))", + "RowCount": 6 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Range", + "Expr": "((a >= 1 and a < 10))", + "RowCount": 6 + }, { "TableID": 57, "TableName": "", @@ -43,6 +99,20 @@ { "Expr": "a < 3 or b < 4", "Trace": [ + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((a < 3))", + "RowCount": 6 + }, + { + "TableID": 
57, + "TableName": "", + "Type": "Index Stats-Range", + "Expr": "((a < 3))", + "RowCount": 6 + }, { "TableID": 57, "TableName": "", @@ -64,6 +134,13 @@ "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))", "RowCount": 6 }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((b < 4))", + "RowCount": 6 + }, { "TableID": 57, "TableName": "", @@ -99,18 +176,25 @@ "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))", "RowCount": 6 }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((b < 4))", + "RowCount": 6 + }, { "TableID": 57, "TableName": "", "Type": "Table Stats-Expression-CNF", - "Expr": "`lt`(test.t.a, 3)", + "Expr": "`lt`(test.t.b, 4)", "RowCount": 6 }, { "TableID": 57, "TableName": "", "Type": "Table Stats-Expression-CNF", - "Expr": "`lt`(test.t.a, 3)", + "Expr": "`lt`(test.t.b, 4)", "RowCount": 6 }, { @@ -120,18 +204,32 @@ "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))", "RowCount": 6 }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Range", + "Expr": "((a < 3))", + "RowCount": 6 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Range", + "Expr": "((a < 3))", + "RowCount": 6 + }, { "TableID": 57, "TableName": "", "Type": "Table Stats-Expression-CNF", - "Expr": "`lt`(test.t.b, 4)", + "Expr": "`lt`(test.t.a, 3)", "RowCount": 6 }, { "TableID": 57, "TableName": "", "Type": "Table Stats-Expression-CNF", - "Expr": "`lt`(test.t.b, 4)", + "Expr": "`lt`(test.t.a, 3)", "RowCount": 6 }, { @@ -160,6 +258,34 @@ { "Expr": "a = 1 and b = 2", "Trace": [ + { + "TableID": 57, + "TableName": "", + "Type": "Index Stats-Point", + "Expr": "((a = 1) and (b = 2))", + "RowCount": 2 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Point", + "Expr": "((a = 1))", + "RowCount": 4 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Column Stats-Point", + "Expr": "((b = 2))", + "RowCount": 3 + }, + { + "TableID": 57, + "TableName": "", + "Type": "Index 
Stats-Point", + "Expr": "((a = 1) and (b = 2))", + "RowCount": 2 + }, { "TableID": 57, "TableName": "", diff --git a/util/ranger/ranger.go b/util/ranger/ranger.go index 0e39a228ca9a3..7d7cbc9b41a5b 100644 --- a/util/ranger/ranger.go +++ b/util/ranger/ranger.go @@ -17,6 +17,7 @@ package ranger import ( "bytes" "math" + "regexp" "sort" "unicode/utf8" @@ -25,10 +26,13 @@ import ( "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/parser/ast" "github.com/pingcap/tidb/parser/charset" + "github.com/pingcap/tidb/parser/format" "github.com/pingcap/tidb/parser/mysql" "github.com/pingcap/tidb/parser/terror" "github.com/pingcap/tidb/sessionctx" + "github.com/pingcap/tidb/sessionctx/stmtctx" "github.com/pingcap/tidb/types" + driver "github.com/pingcap/tidb/types/parser_driver" "github.com/pingcap/tidb/util/codec" ) @@ -596,3 +600,154 @@ func DetachCondAndBuildRangeForPartition(sctx sessionctx.Context, conditions []e } return d.detachCondAndBuildRangeForCols() } + +// RangesToString prints a list of Ranges into a string which can appear in an SQL as a condition. +func RangesToString(sc *stmtctx.StatementContext, rans []*Range, colNames []string) (string, error) { + for _, ran := range rans { + if len(ran.LowVal) != len(ran.HighVal) { + return "", errors.New("range length mismatch") + } + } + var buffer bytes.Buffer + for i, ran := range rans { + buffer.WriteString("(") + for j := range ran.LowVal { + buffer.WriteString("(") + + // The `Exclude` information is only useful for the last column. + // If it's not the last column, it should always be false, which means it's inclusive. 
+ lowExclude := false + if ran.LowExclude && j == len(ran.LowVal)-1 { + lowExclude = true + } + highExclude := false + if ran.HighExclude && j == len(ran.LowVal)-1 { + highExclude = true + } + + // sanity check: only last column of the `Range` can be an interval + if j < len(ran.LowVal)-1 { + cmp, err := ran.LowVal[j].CompareDatum(sc, &ran.HighVal[j]) + if err != nil { + return "", errors.New("comparing values error: " + err.Error()) + } + if cmp != 0 { + return "", errors.New("unexpected form of range") + } + } + + str, err := RangeSingleColToString(sc, ran.LowVal[j], ran.HighVal[j], lowExclude, highExclude, colNames[j]) + if err != nil { + return "false", err + } + buffer.WriteString(str) + buffer.WriteString(")") + if j < len(ran.LowVal)-1 { + // Conditions on different columns of a range are implicitly connected with AND. + buffer.WriteString(" and ") + } + } + buffer.WriteString(")") + if i < len(rans)-1 { + // Conditions of different ranges are implicitly connected with OR. + buffer.WriteString(" or ") + } + } + result := buffer.String() + + // Simplify some useless conditions. + if matched, err := regexp.MatchString(`^\(*true\)*$`, result); matched || (err != nil) { + return "true", nil + } + return result, nil +} + +// RangeSingleColToString prints a single column of a Range into a string which can appear in an SQL as a condition. 
+func RangeSingleColToString(sc *stmtctx.StatementContext, lowVal, highVal types.Datum, lowExclude, highExclude bool, colName string) (string, error) { + // case 1: low and high are both special values(null, min not null, max value) + lowKind := lowVal.Kind() + highKind := highVal.Kind() + if (lowKind == types.KindNull || lowKind == types.KindMinNotNull || lowKind == types.KindMaxValue) && + (highKind == types.KindNull || highKind == types.KindMinNotNull || highKind == types.KindMaxValue) { + if lowKind == types.KindNull && highKind == types.KindNull && !lowExclude && !highExclude { + return colName + " is null", nil + } + if lowKind == types.KindNull && highKind == types.KindMaxValue && !lowExclude { + return "true", nil + } + if lowKind == types.KindMinNotNull && highKind == types.KindMaxValue { + return colName + " is not null", nil + } + return "false", nil + } + + var buf bytes.Buffer + restoreCtx := format.NewRestoreCtx(format.DefaultRestoreFlags, &buf) + + // case 2: low value and high value are the same, and low value and high value are both inclusive. + cmp, err := lowVal.CompareDatum(sc, &highVal) + if err != nil { + return "false", errors.Trace(err) + } + if cmp == 0 && !lowExclude && !highExclude && !lowVal.IsNull() { + buf.WriteString(colName) + buf.WriteString(" = ") + lowValExpr := driver.ValueExpr{Datum: lowVal} + err := lowValExpr.Restore(restoreCtx) + if err != nil { + return "false", errors.Trace(err) + } + return buf.String(), nil + } + + // case 3: it's an interval. + useOR := false + noLowerPart := false + + // Handle the low value part. 
+ if lowKind == types.KindNull { + buf.WriteString(colName + " is null") + useOR = true + } else if lowKind == types.KindMinNotNull { + noLowerPart = true + } else { + buf.WriteString(colName) + if lowExclude { + buf.WriteString(" > ") + } else { + buf.WriteString(" >= ") + } + lowValExpr := driver.ValueExpr{Datum: lowVal} + err := lowValExpr.Restore(restoreCtx) + if err != nil { + return "false", errors.Trace(err) + } + } + + if !noLowerPart { + if useOR { + buf.WriteString(" or ") + } else { + buf.WriteString(" and ") + } + } + + // Handle the high value part + if highKind == types.KindMaxValue { + buf.WriteString("true") + } else { + buf.WriteString(colName) + if highExclude { + buf.WriteString(" < ") + } else { + buf.WriteString(" <= ") + } + highValExpr := driver.ValueExpr{Datum: highVal} + err := highValExpr.Restore(restoreCtx) + if err != nil { + return "false", errors.Trace(err) + } + } + + return buf.String(), nil +} diff --git a/util/ranger/types.go b/util/ranger/types.go index c950419cf217f..2e8cc1dc6120d 100644 --- a/util/ranger/types.go +++ b/util/ranger/types.go @@ -82,10 +82,10 @@ func (ran *Range) Clone() *Range { // IsPoint returns if the range is a point. 
func (ran *Range) IsPoint(sctx sessionctx.Context) bool { - return ran.isPoint(sctx, sctx.GetSessionVars().RegardNULLAsPoint) + return ran.isPoint(sctx.GetSessionVars().StmtCtx, sctx.GetSessionVars().RegardNULLAsPoint) } -func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool { +func (ran *Range) isPoint(stmtCtx *stmtctx.StatementContext, regardNullAsPoint bool) bool { if len(ran.LowVal) != len(ran.HighVal) { return false } @@ -95,7 +95,7 @@ func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool if a.Kind() == types.KindMinNotNull || b.Kind() == types.KindMaxValue { return false } - cmp, err := a.CompareDatum(sctx.GetSessionVars().StmtCtx, &b) + cmp, err := a.CompareDatum(stmtCtx, &b) if err != nil { return false } @@ -114,12 +114,13 @@ func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool // IsPointNonNullable returns if the range is a point without NULL. func (ran *Range) IsPointNonNullable(sctx sessionctx.Context) bool { - return ran.isPoint(sctx, false) + return ran.isPoint(sctx.GetSessionVars().StmtCtx, false) } // IsPointNullable returns if the range is a point. -func (ran *Range) IsPointNullable(sctx sessionctx.Context) bool { - return ran.isPoint(sctx, true) +// TODO: unify the parameter type with IsPointNonNullable and IsPoint +func (ran *Range) IsPointNullable(stmtCtx *stmtctx.StatementContext) bool { + return ran.isPoint(stmtCtx, true) } // IsFullRange check if the range is full scan range