From f82ca6a8eba175348ac8b633905b89a567597532 Mon Sep 17 00:00:00 2001
From: Zhou Kunqin <25057648+time-and-fate@users.noreply.github.com>
Date: Tue, 7 Dec 2021 19:45:56 +0800
Subject: [PATCH] statistics, util/ranger: add cardinality estimation trace for
 `GetRowCountBy...` (#30321)

---
 planner/core/rule_partition_processor.go |   6 +-
 statistics/table.go                      |  75 ++++++++++-
 statistics/testdata/trace_suite_out.json | 134 +++++++++++++++++++-
 util/ranger/ranger.go                    | 155 +++++++++++++++++++++++
 util/ranger/types.go                     |  13 +-
 5 files changed, 364 insertions(+), 19 deletions(-)

diff --git a/planner/core/rule_partition_processor.go b/planner/core/rule_partition_processor.go
index bb57b0fac33da..7c3bbb565c69d 100644
--- a/planner/core/rule_partition_processor.go
+++ b/planner/core/rule_partition_processor.go
@@ -140,7 +140,7 @@ func (s *partitionProcessor) findUsedPartitions(ctx sessionctx.Context, tbl tabl
 	ranges := detachedResult.Ranges
 	used := make([]int, 0, len(ranges))
 	for _, r := range ranges {
-		if r.IsPointNullable(ctx) {
+		if r.IsPointNullable(ctx.GetSessionVars().StmtCtx) {
 			if !r.HighVal[0].IsNull() {
 				if len(r.HighVal) != len(partIdx) {
 					used = []int{-1}
@@ -473,7 +473,7 @@ func (l *listPartitionPruner) locateColumnPartitionsByCondition(cond expression.
 			return nil, true, nil
 		}
 		var locations []tables.ListPartitionLocation
-		if r.IsPointNullable(l.ctx) {
+		if r.IsPointNullable(l.ctx.GetSessionVars().StmtCtx) {
 			location, err := colPrune.LocatePartition(sc, r.HighVal[0])
 			if types.ErrOverflow.Equal(err) {
 				return nil, true, nil // return full-scan if over-flow
@@ -555,7 +555,7 @@ func (l *listPartitionPruner) findUsedListPartitions(conds []expression.Expressi
 	}
 	used := make(map[int]struct{}, len(ranges))
 	for _, r := range ranges {
-		if r.IsPointNullable(l.ctx) {
+		if r.IsPointNullable(l.ctx.GetSessionVars().StmtCtx) {
 			if len(r.HighVal) != len(exprCols) {
 				return l.fullRange, nil
 			}
diff --git a/statistics/table.go b/statistics/table.go
index e0fabb74f8bd1..73c1d608fc403 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -33,8 +33,11 @@ import (
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/chunk"
 	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/logutil"
 	"github.com/pingcap/tidb/util/ranger"
+	"github.com/pingcap/tidb/util/tracing"
 	"go.uber.org/atomic"
+	"go.uber.org/zap"
 )
 
 const (
@@ -331,17 +334,26 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da
 
 // GetRowCountByIntColumnRanges estimates the row count by a slice of IntColumnRange.
 func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, colID int64, intRanges []*ranger.Range) (float64, error) {
+	var result float64
 	c, ok := coll.Columns[colID]
 	if !ok || c.IsInvalid(sc, coll.Pseudo) {
 		if len(intRanges) == 0 {
 			return 0, nil
 		}
 		if intRanges[0].LowVal[0].Kind() == types.KindInt64 {
-			return getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count)), nil
+			result = getPseudoRowCountBySignedIntRanges(intRanges, float64(coll.Count))
+		} else {
+			result = getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count))
+		}
+		if sc.EnableOptimizerCETrace && ok { // c.Info is only read when the column was found
+			CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats-Pseudo", uint64(result))
 		}
-		return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil
+		return result, nil
 	}
 	result, err := c.GetColumnRowCount(sc, intRanges, coll.Count, true)
+	if err == nil && sc.EnableOptimizerCETrace { // never trace the result of a failed estimation
+		CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats", uint64(result))
+	}
 	return result, errors.Trace(err)
 }
 
@@ -349,21 +361,38 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext,
 func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, colID int64, colRanges []*ranger.Range) (float64, error) {
 	c, ok := coll.Columns[colID]
 	if !ok || c.IsInvalid(sc, coll.Pseudo) {
-		return GetPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0)
+		result, err := GetPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0)
+		if err == nil && sc.EnableOptimizerCETrace && ok { // c.Info is only read when the column was found
+			CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats-Pseudo", uint64(result))
+		}
+		return result, err
 	}
 	result, err := c.GetColumnRowCount(sc, colRanges, coll.Count, false)
+	if err == nil && sc.EnableOptimizerCETrace { // same guard as the pseudo branch: never trace a failed estimation
+		CETraceRange(sc, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats", uint64(result))
+	}
 	return result, errors.Trace(err)
 }
 
 // GetRowCountByIndexRanges estimates the row count by a slice of Range.
 func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
-	idx := coll.Indices[idxID]
-	if idx == nil || idx.IsInvalid(coll.Pseudo) {
+	idx, ok := coll.Indices[idxID]
+	colNames := make([]string, 0, 8)
+	if ok { // NOTE(review): assumes the map never stores a nil index (the old code checked idx == nil) - confirm
+		for _, col := range idx.Info.Columns {
+			colNames = append(colNames, col.Name.O)
+		}
+	}
+	if !ok || idx.IsInvalid(coll.Pseudo) {
 		colsLen := -1
 		if idx != nil && idx.Info.Unique {
 			colsLen = len(idx.Info.Columns)
 		}
-		return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), colsLen)
+		result, err := getPseudoRowCountByIndexRanges(sc, indexRanges, float64(coll.Count), colsLen)
+		if err == nil && sc.EnableOptimizerCETrace && ok { // colNames is only filled when the index was found
+			CETraceRange(sc, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result))
+		}
+		return result, err
 	}
 	var result float64
 	var err error
@@ -372,9 +401,43 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx
 	} else {
 		result, err = idx.GetRowCount(sc, coll, indexRanges, coll.Count)
 	}
+	if err == nil && sc.EnableOptimizerCETrace { // same guard as the pseudo branch: never trace a failed estimation
+		CETraceRange(sc, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result))
+	}
 	return result, errors.Trace(err)
 }
 
+// CETraceRange appends a list of ranges and related information into CE trace.
+// colNames[i] names the i-th column of each range. tp is the record type prefix; it gets a
+// "-Point" suffix when every range is a (nullable) point and a "-Range" suffix otherwise.
+// Ranges that cannot be printed, or that print as a trivial condition, are not recorded.
+func CETraceRange(sc *stmtctx.StatementContext, tableID int64, colNames []string, ranges []*ranger.Range, tp string, rowCount uint64) {
+	expr, err := ranger.RangesToString(sc, ranges, colNames)
+	if err != nil {
+		// Tracing is best-effort: log and stop here instead of relying on the value returned with the error.
+		logutil.BgLogger().Debug("[OptimizerTrace] Failed to trace CE of ranges", zap.Error(err))
+		return
+	}
+	// We don't need to record meaningless expressions.
+	if expr == "" || expr == "true" || expr == "false" {
+		return
+	}
+	suffix := "-Point"
+	for _, ran := range ranges {
+		if !ran.IsPointNullable(sc) {
+			suffix = "-Range"
+			break
+		}
+	}
+	record := tracing.CETraceRecord{
+		TableID:  tableID,
+		Type:     tp + suffix,
+		Expr:     expr,
+		RowCount: rowCount,
+	}
+	sc.OptimizerCETrace = append(sc.OptimizerCETrace, &record)
+}
+
 // PseudoAvgCountPerValue gets a pseudo average count if histogram not exists.
 func (t *Table) PseudoAvgCountPerValue() float64 {
 	return float64(t.Count) / pseudoEqualRate
diff --git a/statistics/testdata/trace_suite_out.json b/statistics/testdata/trace_suite_out.json
index d45173d34d24c..9d13e199263a3 100644
--- a/statistics/testdata/trace_suite_out.json
+++ b/statistics/testdata/trace_suite_out.json
@@ -5,6 +5,41 @@
       {
         "Expr": "a > 0 and a < 2",
         "Trace": [
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Point",
+            "Expr": "((a = 1))",
+            "RowCount": 4
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Point",
+            "Expr": "((a = 1))",
+            "RowCount": 4
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((a > 0 and a < 2))",
+            "RowCount": 4
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Point",
+            "Expr": "((a = 1))",
+            "RowCount": 4
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Point",
+            "Expr": "((a = 1))",
+            "RowCount": 4
+          },
           {
             "TableID": 57,
             "TableName": "",
@@ -24,6 +59,27 @@
       {
         "Expr": "a >= 1 and a < 10",
         "Trace": [
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Range",
+            "Expr": "((a >= 1 and a < 10))",
+            "RowCount": 6
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((a >= 1 and a < 10))",
+            "RowCount": 6
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Range",
+            "Expr": "((a >= 1 and a < 10))",
+            "RowCount": 6
+          },
           {
             "TableID": 57,
             "TableName": "",
@@ -43,6 +99,20 @@
       {
         "Expr": "a < 3 or b < 4",
         "Trace": [
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((a < 3))",
+            "RowCount": 6
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Range",
+            "Expr": "((a < 3))",
+            "RowCount": 6
+          },
           {
             "TableID": 57,
             "TableName": "",
@@ -64,6 +134,13 @@
             "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))",
             "RowCount": 6
           },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((b < 4))",
+            "RowCount": 6
+          },
           {
             "TableID": 57,
             "TableName": "",
@@ -99,18 +176,25 @@
             "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))",
             "RowCount": 6
           },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((b < 4))",
+            "RowCount": 6
+          },
           {
             "TableID": 57,
             "TableName": "",
             "Type": "Table Stats-Expression-CNF",
-            "Expr": "`lt`(test.t.a, 3)",
+            "Expr": "`lt`(test.t.b, 4)",
             "RowCount": 6
           },
           {
             "TableID": 57,
             "TableName": "",
             "Type": "Table Stats-Expression-CNF",
-            "Expr": "`lt`(test.t.a, 3)",
+            "Expr": "`lt`(test.t.b, 4)",
             "RowCount": 6
           },
           {
@@ -120,18 +204,32 @@
             "Expr": "`or`(`lt`(test.t.a, 3), `lt`(test.t.b, 4))",
             "RowCount": 6
           },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Range",
+            "Expr": "((a < 3))",
+            "RowCount": 6
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Range",
+            "Expr": "((a < 3))",
+            "RowCount": 6
+          },
           {
             "TableID": 57,
             "TableName": "",
             "Type": "Table Stats-Expression-CNF",
-            "Expr": "`lt`(test.t.b, 4)",
+            "Expr": "`lt`(test.t.a, 3)",
             "RowCount": 6
           },
           {
             "TableID": 57,
             "TableName": "",
             "Type": "Table Stats-Expression-CNF",
-            "Expr": "`lt`(test.t.b, 4)",
+            "Expr": "`lt`(test.t.a, 3)",
             "RowCount": 6
           },
           {
@@ -160,6 +258,34 @@
       {
         "Expr": "a = 1 and b = 2",
         "Trace": [
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Point",
+            "Expr": "((a = 1) and (b = 2))",
+            "RowCount": 2
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Point",
+            "Expr": "((a = 1))",
+            "RowCount": 4
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Column Stats-Point",
+            "Expr": "((b = 2))",
+            "RowCount": 3
+          },
+          {
+            "TableID": 57,
+            "TableName": "",
+            "Type": "Index Stats-Point",
+            "Expr": "((a = 1) and (b = 2))",
+            "RowCount": 2
+          },
           {
             "TableID": 57,
             "TableName": "",
diff --git a/util/ranger/ranger.go b/util/ranger/ranger.go
index 0e39a228ca9a3..7d7cbc9b41a5b 100644
--- a/util/ranger/ranger.go
+++ b/util/ranger/ranger.go
@@ -17,6 +17,7 @@ package ranger
 import (
 	"bytes"
 	"math"
+	"regexp"
 	"sort"
 	"unicode/utf8"
 
@@ -25,10 +26,13 @@ import (
 	"github.com/pingcap/tidb/kv"
 	"github.com/pingcap/tidb/parser/ast"
 	"github.com/pingcap/tidb/parser/charset"
+	"github.com/pingcap/tidb/parser/format"
 	"github.com/pingcap/tidb/parser/mysql"
 	"github.com/pingcap/tidb/parser/terror"
 	"github.com/pingcap/tidb/sessionctx"
+	"github.com/pingcap/tidb/sessionctx/stmtctx"
 	"github.com/pingcap/tidb/types"
+	driver "github.com/pingcap/tidb/types/parser_driver"
 	"github.com/pingcap/tidb/util/codec"
 )
 
@@ -596,3 +600,154 @@ func DetachCondAndBuildRangeForPartition(sctx sessionctx.Context, conditions []e
 	}
 	return d.detachCondAndBuildRangeForCols()
 }
+
+// trueCondRe matches a condition that is nothing but "true" in parentheses; compiled once, not per call.
+var trueCondRe = regexp.MustCompile(`^\(*true\)*$`)
+
+// RangesToString prints a list of Ranges into a string which can appear in an SQL as a condition.
+// colNames[j] names the j-th column of every range. An error is returned for a malformed range or
+// for a range with more columns than colNames; a condition that is always true is printed as "true".
+func RangesToString(sc *stmtctx.StatementContext, rans []*Range, colNames []string) (string, error) {
+	for _, ran := range rans {
+		if len(ran.LowVal) != len(ran.HighVal) {
+			return "", errors.New("range length mismatch")
+		}
+		if len(ran.LowVal) > len(colNames) { // would otherwise panic on colNames[j] below
+			return "", errors.New("range has more columns than column names")
+		}
+	}
+	var buffer bytes.Buffer
+	for i, ran := range rans {
+		buffer.WriteString("(")
+		lastCol := len(ran.LowVal) - 1
+		for j := range ran.LowVal {
+			buffer.WriteString("(")
+			// The `Exclude` information is only useful for the last column.
+			// If it's not the last column, it should always be false, which means it's inclusive.
+			lowExclude := ran.LowExclude && j == lastCol
+			highExclude := ran.HighExclude && j == lastCol
+
+			// sanity check: only last column of the `Range` can be an interval
+			if j < lastCol {
+				cmp, err := ran.LowVal[j].CompareDatum(sc, &ran.HighVal[j])
+				if err != nil {
+					return "", errors.New("comparing values error: " + err.Error())
+				}
+				if cmp != 0 {
+					return "", errors.New("unexpected form of range")
+				}
+			}
+			str, err := RangeSingleColToString(sc, ran.LowVal[j], ran.HighVal[j], lowExclude, highExclude, colNames[j])
+			if err != nil {
+				return "false", err
+			}
+			buffer.WriteString(str)
+			buffer.WriteString(")")
+			if j < lastCol {
+				// Conditions on different columns of a range are implicitly connected with AND.
+				buffer.WriteString(" and ")
+			}
+		}
+		buffer.WriteString(")")
+		if i < len(rans)-1 {
+			// Conditions of different ranges are implicitly connected with OR.
+			buffer.WriteString(" or ")
+		}
+	}
+	result := buffer.String()
+	// Simplify some useless conditions.
+	if trueCondRe.MatchString(result) {
+		return "true", nil
+	}
+	return result, nil
+}
+
+// RangeSingleColToString renders one column of a Range as a string that can be used as a condition in an SQL.
+// lowVal and highVal are the bounds on that column, lowExclude and highExclude mark the matching bound as open,
+// and colName is the name printed for the column. Three shapes are distinguished:
+//  1. both bounds are special values: "is null", "is not null", "true" or "false";
+//  2. a closed range whose bounds compare equal: "col = v";
+//  3. anything else: an interval built from comparisons of the column with the bounds.
+// When an error is returned, the returned string is "false".
+func RangeSingleColToString(sc *stmtctx.StatementContext, lowVal, highVal types.Datum, lowExclude, highExclude bool, colName string) (string, error) {
+	lowKind, highKind := lowVal.Kind(), highVal.Kind()
+	lowSpecial := lowKind == types.KindNull || lowKind == types.KindMinNotNull || lowKind == types.KindMaxValue
+	highSpecial := highKind == types.KindNull || highKind == types.KindMinNotNull || highKind == types.KindMaxValue
+
+	// case 1: low and high are both special values(null, min not null, max value), no literal is printed.
+	if lowSpecial && highSpecial {
+		switch {
+		case lowKind == types.KindNull && highKind == types.KindNull && !lowExclude && !highExclude:
+			return colName + " is null", nil
+		case lowKind == types.KindNull && highKind == types.KindMaxValue && !lowExclude:
+			return "true", nil
+		case lowKind == types.KindMinNotNull && highKind == types.KindMaxValue:
+			return colName + " is not null", nil
+		default:
+			// Every other combination of special values is printed as "false".
+			return "false", nil
+		}
+	}
+
+	// From here on at least one literal may be printed; literals are restored into sb through rctx.
+	var sb bytes.Buffer
+	rctx := format.NewRestoreCtx(format.DefaultRestoreFlags, &sb)
+	// writeCmp appends colName, op and the restored form of val to sb.
+	writeCmp := func(op string, val types.Datum) error {
+		sb.WriteString(colName)
+		sb.WriteString(op)
+		valExpr := driver.ValueExpr{Datum: val}
+		if err := valExpr.Restore(rctx); err != nil {
+			return errors.Trace(err)
+		}
+		return nil
+	}
+
+	// case 2: low value and high value are the same, and low value and high value are both inclusive.
+	cmp, err := lowVal.CompareDatum(sc, &highVal)
+	if err != nil {
+		return "false", errors.Trace(err)
+	}
+	if cmp == 0 && !lowExclude && !highExclude && !lowVal.IsNull() {
+		if err = writeCmp(" = ", lowVal); err != nil {
+			return "false", err
+		}
+		return sb.String(), nil
+	}
+
+	// case 3: it's an interval.
+	// Print the low value part together with the connector that joins it to the high value part.
+	switch lowKind {
+	case types.KindNull:
+		// A null low value is printed as "is null" and joined to the high value part with OR.
+		sb.WriteString(colName + " is null")
+		sb.WriteString(" or ")
+	case types.KindMinNotNull:
+		// No low value part, hence no connector either.
+	default:
+		// Any other low value is compared with the column and joined to the high value part with AND.
+		op := " >= "
+		if lowExclude {
+			op = " > "
+		}
+		if err = writeCmp(op, lowVal); err != nil {
+			return "false", err
+		}
+		sb.WriteString(" and ")
+	}
+
+	// Print the high value part.
+	if highKind == types.KindMaxValue {
+		// A max value high bound is printed as the literal "true".
+		sb.WriteString("true")
+		return sb.String(), nil
+	}
+	op := " <= "
+	if highExclude {
+		op = " < "
+	}
+	if err = writeCmp(op, highVal); err != nil {
+		return "false", err
+	}
+	return sb.String(), nil
+}
diff --git a/util/ranger/types.go b/util/ranger/types.go
index c950419cf217f..2e8cc1dc6120d 100644
--- a/util/ranger/types.go
+++ b/util/ranger/types.go
@@ -82,10 +82,10 @@ func (ran *Range) Clone() *Range {
 
 // IsPoint returns if the range is a point.
 func (ran *Range) IsPoint(sctx sessionctx.Context) bool {
-	return ran.isPoint(sctx, sctx.GetSessionVars().RegardNULLAsPoint)
+	return ran.isPoint(sctx.GetSessionVars().StmtCtx, sctx.GetSessionVars().RegardNULLAsPoint) // regardNullAsPoint comes from the session variable
 }
 
-func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool {
+func (ran *Range) isPoint(stmtCtx *stmtctx.StatementContext, regardNullAsPoint bool) bool {
 	if len(ran.LowVal) != len(ran.HighVal) {
 		return false
 	}
@@ -95,7 +95,7 @@ func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool
 		if a.Kind() == types.KindMinNotNull || b.Kind() == types.KindMaxValue {
 			return false
 		}
-		cmp, err := a.CompareDatum(sctx.GetSessionVars().StmtCtx, &b)
+		cmp, err := a.CompareDatum(stmtCtx, &b)
 		if err != nil {
 			return false
 		}
@@ -114,12 +114,13 @@ func (ran *Range) isPoint(sctx sessionctx.Context, regardNullAsPoint bool) bool
 
 // IsPointNonNullable returns if the range is a point without NULL.
 func (ran *Range) IsPointNonNullable(sctx sessionctx.Context) bool {
-	return ran.isPoint(sctx, false)
+	return ran.isPoint(sctx.GetSessionVars().StmtCtx, false) // regardNullAsPoint = false
 }
 
 // IsPointNullable returns if the range is a point.
-func (ran *Range) IsPointNullable(sctx sessionctx.Context) bool {
-	return ran.isPoint(sctx, true)
+// TODO: unify the parameter type with IsPointNonNullable and IsPoint
+func (ran *Range) IsPointNullable(stmtCtx *stmtctx.StatementContext) bool {
+	return ran.isPoint(stmtCtx, true) // regardNullAsPoint = true
 }
 
 // IsFullRange check if the range is full scan range