opt: avoid estimating row count = 0

This commit improves our statistics estimates so that we never estimate zero rows unless the row count is provably zero (e.g., SELECT ... WHERE false). We want to avoid estimating zero rows since the stats may be stale, and we can end up with weird and inefficient plans if we estimate zero rows. Therefore, this commit changes the logic in the statisticsBuilder so that a row count of 0 is replaced with 1, unless that would be inconsistent with the cardinality.

This commit also updates all estimates for distinct count and null count to ensure that they are never larger than the row count. We also ensure that there is at least one distinct or null value if row count > 0.

Fixes cockroachdb#32578

Release note: None
rytaft · May 22, 2019 · 13250f3 · 13250f3
1 parent 51a5dc9
commit 13250f3
Show file tree

Hide file tree

Showing 8 changed files with 109 additions and 48 deletions.
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -550,6 +550,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
 		colStat.NullCount = 0
 	}
 
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -574,7 +575,9 @@ func (sb *statisticsBuilder) colStatVirtualScan(
 	colSet opt.ColSet, scan *VirtualScanExpr,
 ) *props.ColumnStatistic {
 	s := &scan.Relational().Stats
-	return sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet))
+	colStat := sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet))
+	sb.finalizeFromRowCount(colStat, s.RowCount)
+	return colStat
 }
 
 // +--------+
@@ -655,6 +658,7 @@ func (sb *statisticsBuilder) colStatSelect(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -739,6 +743,7 @@ func (sb *statisticsBuilder) colStatProject(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -962,6 +967,7 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
 		// Column stats come from left side of join.
 		colStat := sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join))
 		colStat.ApplySelectivity(s.Selectivity, leftProps.Stats.RowCount)
+		sb.finalizeFromRowCount(colStat, s.RowCount)
 		return colStat
 
 	default:
@@ -1048,15 +1054,10 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
 			)
 		}
 
-		// The distinct count should be no larger than the row count.
-		if colStat.DistinctCount > s.RowCount {
-			colStat.DistinctCount = s.RowCount
-		}
-		// Similarly, the null count should be no larger than RowCount.
-		colStat.NullCount = min(s.RowCount, colStat.NullCount)
 		if colSet.SubsetOf(relProps.NotNullCols) {
 			colStat.NullCount = 0
 		}
+		sb.finalizeFromRowCount(colStat, s.RowCount)
 		return colStat
 	}
 }
@@ -1244,15 +1245,10 @@ func (sb *statisticsBuilder) colStatIndexJoin(
 		colStat.NullCount = inputStats.RowCount * (f1 + f2 - f1*f2)
 	}
 
-	// The distinct count should be no larger than the row count.
-	if colStat.DistinctCount > s.RowCount {
-		colStat.DistinctCount = s.RowCount
-	}
-	// Similarly, the null count should be no larger than RowCount.
-	colStat.NullCount = min(s.RowCount, colStat.NullCount)
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1420,11 +1416,11 @@ func (sb *statisticsBuilder) colStatGroupBy(
 		inputRowCount := sb.statsFromChild(groupNode, 0 /* childIdx */).RowCount
 		colStat.NullCount = ((colStat.DistinctCount + 1) / inputRowCount) * inputColStat.NullCount
 	}
-	colStat.NullCount = min(s.RowCount, colStat.NullCount)
 
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1521,6 +1517,7 @@ func (sb *statisticsBuilder) colStatSetNodeImpl(
 	if outputCols.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1585,6 +1582,7 @@ func (sb *statisticsBuilder) colStatValues(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = float64(len(distinct))
 	colStat.NullCount = float64(nullCount)
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1629,6 +1627,7 @@ func (sb *statisticsBuilder) colStatLimit(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1675,6 +1674,7 @@ func (sb *statisticsBuilder) colStatOffset(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1703,6 +1703,7 @@ func (sb *statisticsBuilder) colStatMax1Row(
 	if colSet.SubsetOf(max1Row.Relational().NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1753,6 +1754,7 @@ func (sb *statisticsBuilder) colStatOrdinality(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1817,6 +1819,7 @@ func (sb *statisticsBuilder) colStatWindow(
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1944,13 +1947,10 @@ func (sb *statisticsBuilder) colStatProjectSet(
 		colStat.NullCount = s.RowCount * (f1 + f2 - f1*f2)
 	}
 
-	// The distinct count and null count should be no larger than the row count.
-	colStat.DistinctCount = min(s.RowCount, colStat.DistinctCount)
-	colStat.NullCount = min(s.RowCount, colStat.NullCount)
-
 	if colSet.SubsetOf(projectSet.Relational().NotNullCols) {
 		colStat.NullCount = 0
 	}
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -1986,6 +1986,7 @@ func (sb *statisticsBuilder) colStatMutation(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = inColStat.DistinctCount
 	colStat.NullCount = inColStat.NullCount
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -2007,6 +2008,7 @@ func (sb *statisticsBuilder) colStatSequenceSelect(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = 1
 	colStat.NullCount = 0
+	sb.finalizeFromRowCount(colStat, s.RowCount)
 	return colStat
 }
 
@@ -2100,21 +2102,44 @@ func translateColSet(colSetIn opt.ColSet, from opt.ColList, to opt.ColList) opt.
 
 func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) {
 	s := &relProps.Stats
+
+	// We never want to estimate row count = 0 unless the cardinality is
+	// provably zero. The stats may be stale, and estimating 0 rows can lead
+	// to weird and inefficient plans.
+	//
+	// Raise a non-positive row count to 1 here; the clamp below lowers it
+	// again when the maximum cardinality is smaller (e.g., 0).
+	if s.RowCount <= 0 {
+		s.RowCount = 1
+	}
+
 	// The row count should be between the min and max cardinality.
 	if s.RowCount > float64(relProps.Cardinality.Max) && relProps.Cardinality.Max != math.MaxUint32 {
 		s.RowCount = float64(relProps.Cardinality.Max)
 	} else if s.RowCount < float64(relProps.Cardinality.Min) {
 		s.RowCount = float64(relProps.Cardinality.Min)
 	}
 
-	// The distinct and null counts should be no larger than the row count.
 	for i, n := 0, s.ColStats.Count(); i < n; i++ {
 		colStat := s.ColStats.Get(i)
-		colStat.DistinctCount = min(colStat.DistinctCount, s.RowCount)
-		colStat.NullCount = min(colStat.NullCount, s.RowCount)
+		sb.finalizeFromRowCount(colStat, s.RowCount)
 	}
 }
 
+func (sb *statisticsBuilder) finalizeFromRowCount(
+	colStat *props.ColumnStatistic, rowCount float64,
+) {
+	// If row count > 0 there must be at least one distinct or null value, so
+	// when both counts are zero, set the distinct count to 1.
+	if rowCount > 0 && colStat.DistinctCount == 0 && colStat.NullCount == 0 {
+		colStat.DistinctCount = 1
+	}
+
+	// Cap the distinct and null counts at the row count (this also caps the distinct count set above when 0 < row count < 1).
+	colStat.DistinctCount = min(colStat.DistinctCount, rowCount)
+	colStat.NullCount = min(colStat.NullCount, rowCount)
+}
+
 func min(a float64, b float64) float64 {
 	if a < b {
 		return a

diff --git a/pkg/sql/opt/memo/testdata/memo b/pkg/sql/opt/memo/testdata/memo
@@ -357,7 +357,7 @@ memo (optimized, ~5KB, required=[presentation: field:3])
  ├── G1: (distinct-on G2 G3 cols=(3))
  │    └── [presentation: field:3]
  │         ├── best: (distinct-on G2 G3 cols=(3))
- │         └── cost: 0.04
+ │         └── cost: 0.05
  ├── G2: (explain G4 [presentation: k:1])
  │    └── []
  │         ├── best: (explain G4="[presentation: k:1]" [presentation: k:1])
@@ -379,7 +379,7 @@ memo (optimized, ~2KB, required=[presentation: tag:4])
  ├── G1: (distinct-on G2 G3 cols=(4))
  │    └── [presentation: tag:4]
  │         ├── best: (distinct-on G2 G3 cols=(4))
- │         └── cost: 0.02
+ │         └── cost: 0.03
  ├── G2: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]})
  │    └── []
  │         ├── best: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]})

diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby
@@ -479,12 +479,12 @@ GROUP BY q.b
 project
  ├── columns: "?column?":4(int!null)
  ├── cardinality: [0 - 3]
- ├── stats: [rows=0]
+ ├── stats: [rows=1]
  ├── fd: ()-->(4)
  ├── select
  │    ├── columns: column2:2(int) bool_or:3(bool!null)
  │    ├── cardinality: [0 - 3]
- │    ├── stats: [rows=0, distinct(3)=0, null(3)=0]
+ │    ├── stats: [rows=1, distinct(3)=1, null(3)=0]
  │    ├── key: (2)
  │    ├── fd: ()-->(3)
  │    ├── group-by

diff --git a/pkg/sql/opt/memo/testdata/stats/limit b/pkg/sql/opt/memo/testdata/stats/limit
@@ -209,3 +209,39 @@ limit
  │    └── filters
  │         └── s = 'foo' [type=bool, outer=(3), constraints=(/3: [/'foo' - /'foo']; tight), fd=()-->(3)]
  └── const: 5 [type=int]
+
+exec-ddl
+CREATE TABLE b (x int)
+----
+TABLE b
+ ├── x int
+ ├── rowid int not null (hidden)
+ └── INDEX primary
+      └── rowid int not null (hidden)
+
+# Regression test for #32578. Ensure that we don't estimate 0 rows for the
+# offset.
+opt colstat=1
+SELECT * FROM b ORDER BY x LIMIT 1 OFFSET 9999
+----
+limit
+ ├── columns: x:1(int)
+ ├── internal-ordering: +1
+ ├── cardinality: [0 - 1]
+ ├── stats: [rows=1, distinct(1)=1, null(1)=0]
+ ├── key: ()
+ ├── fd: ()-->(1)
+ ├── offset
+ │    ├── columns: x:1(int)
+ │    ├── internal-ordering: +1
+ │    ├── stats: [rows=1, distinct(1)=1, null(1)=0]
+ │    ├── ordering: +1
+ │    ├── sort
+ │    │    ├── columns: x:1(int)
+ │    │    ├── stats: [rows=1000, distinct(1)=100, null(1)=10]
+ │    │    ├── ordering: +1
+ │    │    └── scan b
+ │    │         ├── columns: x:1(int)
+ │    │         └── stats: [rows=1000, distinct(1)=100, null(1)=10]
+ │    └── const: 9999 [type=int]
+ └── const: 1 [type=int]
diff --git a/pkg/sql/opt/memo/testdata/stats/ordinality b/pkg/sql/opt/memo/testdata/stats/ordinality
@@ -34,12 +34,12 @@ select
  ├── fd: (1)-->(2,3), (3)-->(1,2)
  ├── ordinality
  │    ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
- │    ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
+ │    ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
  │    ├── key: (1)
  │    ├── fd: (1)-->(2,3), (3)-->(1,2)
  │    └── scan a
  │         ├── columns: x:1(int!null) y:2(int)
- │         ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
+ │         ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
  │         ├── key: (1)
  │         └── fd: (1)-->(2)
  └── filters
@@ -55,12 +55,12 @@ select
  ├── fd: (1)-->(2,3), (3)-->(1,2)
  ├── ordinality
  │    ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
- │    ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0]
+ │    ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0]
  │    ├── key: (1)
  │    ├── fd: (1)-->(2,3), (3)-->(1,2)
  │    └── scan a
  │         ├── columns: x:1(int!null) y:2(int)
- │         ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
+ │         ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0]
  │         ├── key: (1)
  │         └── fd: (1)-->(2)
  └── filters
@@ -96,12 +96,12 @@ project
       ├── fd: (1)-->(3), (3)-->(1)
       ├── ordinality
       │    ├── columns: x:1(int!null) ordinality:3(int!null)
-      │    ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
+      │    ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
       │    ├── key: (1)
       │    ├── fd: (1)-->(3), (3)-->(1)
       │    └── scan a
       │         ├── columns: x:1(int!null)
-      │         ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
+      │         ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
       │         └── key: (1)
       └── filters
            └── (ordinality > 0) AND (ordinality <= 10) [type=bool, outer=(3), constraints=(/3: [/1 - /10]; tight)]
@@ -118,12 +118,12 @@ select
  ├── fd: ()-->(1-3)
  ├── ordinality
  │    ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
- │    ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
+ │    ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
  │    ├── key: (1)
  │    ├── fd: (1)-->(2,3), (3)-->(1,2)
  │    └── scan a
  │         ├── columns: x:1(int!null) y:2(int)
- │         ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
+ │         ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
  │         ├── key: (1)
  │         └── fd: (1)-->(2)
  └── filters