Skip to content

Commit

Permalink
opt: avoid estimating row count = 0
Browse files Browse the repository at this point in the history
This commit improves our statistics estimates so that we never estimate
zero rows unless the row count is provably zero (e.g., SELECT ... WHERE false).
We want to avoid estimating zero rows since the stats may be stale, and
we can end up with weird and inefficient plans if we estimate zero rows.
Therefore, this commit changes the logic in the statisticsBuilder so that
a row count of 0 is replaced with 1, unless that would be inconsistent with
the cardinality.

This commit also updates all estimates for distinct count and null count
to ensure that they are never larger than the row count. We also ensure
that there is at least one distinct or null value if row count > 0.

Fixes cockroachdb#32578

Release note: None
  • Loading branch information
rytaft committed May 22, 2019
1 parent 51a5dc9 commit 13250f3
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 48 deletions.
67 changes: 46 additions & 21 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
colStat.NullCount = 0
}

sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand All @@ -574,7 +575,9 @@ func (sb *statisticsBuilder) colStatVirtualScan(
colSet opt.ColSet, scan *VirtualScanExpr,
) *props.ColumnStatistic {
s := &scan.Relational().Stats
return sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet))
colStat := sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet))
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

// +--------+
Expand Down Expand Up @@ -655,6 +658,7 @@ func (sb *statisticsBuilder) colStatSelect(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -739,6 +743,7 @@ func (sb *statisticsBuilder) colStatProject(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -962,6 +967,7 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
// Column stats come from left side of join.
colStat := sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join))
colStat.ApplySelectivity(s.Selectivity, leftProps.Stats.RowCount)
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat

default:
Expand Down Expand Up @@ -1048,15 +1054,10 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
)
}

// The distinct count should be no larger than the row count.
if colStat.DistinctCount > s.RowCount {
colStat.DistinctCount = s.RowCount
}
// Similarly, the null count should be no larger than RowCount.
colStat.NullCount = min(s.RowCount, colStat.NullCount)
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}
}
Expand Down Expand Up @@ -1244,15 +1245,10 @@ func (sb *statisticsBuilder) colStatIndexJoin(
colStat.NullCount = inputStats.RowCount * (f1 + f2 - f1*f2)
}

// The distinct count should be no larger than the row count.
if colStat.DistinctCount > s.RowCount {
colStat.DistinctCount = s.RowCount
}
// Similarly, the null count should be no larger than RowCount.
colStat.NullCount = min(s.RowCount, colStat.NullCount)
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1420,11 +1416,11 @@ func (sb *statisticsBuilder) colStatGroupBy(
inputRowCount := sb.statsFromChild(groupNode, 0 /* childIdx */).RowCount
colStat.NullCount = ((colStat.DistinctCount + 1) / inputRowCount) * inputColStat.NullCount
}
colStat.NullCount = min(s.RowCount, colStat.NullCount)

if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1521,6 +1517,7 @@ func (sb *statisticsBuilder) colStatSetNodeImpl(
if outputCols.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1585,6 +1582,7 @@ func (sb *statisticsBuilder) colStatValues(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = float64(len(distinct))
colStat.NullCount = float64(nullCount)
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1629,6 +1627,7 @@ func (sb *statisticsBuilder) colStatLimit(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1675,6 +1674,7 @@ func (sb *statisticsBuilder) colStatOffset(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1703,6 +1703,7 @@ func (sb *statisticsBuilder) colStatMax1Row(
if colSet.SubsetOf(max1Row.Relational().NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1753,6 +1754,7 @@ func (sb *statisticsBuilder) colStatOrdinality(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1817,6 +1819,7 @@ func (sb *statisticsBuilder) colStatWindow(
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1944,13 +1947,10 @@ func (sb *statisticsBuilder) colStatProjectSet(
colStat.NullCount = s.RowCount * (f1 + f2 - f1*f2)
}

// The distinct count and null count should be no larger than the row count.
colStat.DistinctCount = min(s.RowCount, colStat.DistinctCount)
colStat.NullCount = min(s.RowCount, colStat.NullCount)

if colSet.SubsetOf(projectSet.Relational().NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -1986,6 +1986,7 @@ func (sb *statisticsBuilder) colStatMutation(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = inColStat.DistinctCount
colStat.NullCount = inColStat.NullCount
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand All @@ -2007,6 +2008,7 @@ func (sb *statisticsBuilder) colStatSequenceSelect(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = 0
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}

Expand Down Expand Up @@ -2100,21 +2102,44 @@ func translateColSet(colSetIn opt.ColSet, from opt.ColList, to opt.ColList) opt.

// finalizeFromCardinality reconciles the estimated row count stored in
// relProps.Stats with the expression's cardinality bounds, and then brings
// every column statistic already present in relProps.Stats.ColStats in line
// with the (possibly adjusted) row count.
func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) {
	s := &relProps.Stats

	// We don't ever want row count = 0 unless the cardinality is zero.
	// This is because the stats may be stale, and we can end up with weird and
	// inefficient plans if we estimate 0 rows.
	//
	// Increment the row count here if necessary, but it may be reduced below if
	// the cardinality is 0.
	if s.RowCount <= 0 {
		s.RowCount = 1
	}

	// The row count should be between the min and max cardinality. A max of
	// math.MaxUint32 is exempt from the upper clamp (presumably it denotes an
	// unbounded maximum -- confirm against props.Cardinality).
	if s.RowCount > float64(relProps.Cardinality.Max) && relProps.Cardinality.Max != math.MaxUint32 {
		s.RowCount = float64(relProps.Cardinality.Max)
	} else if s.RowCount < float64(relProps.Cardinality.Min) {
		s.RowCount = float64(relProps.Cardinality.Min)
	}

	// finalizeFromRowCount caps the distinct and null counts at the row count
	// and ensures at least one distinct or null value when row count > 0, so
	// no separate clamping is needed here.
	for i, n := 0, s.ColStats.Count(); i < n; i++ {
		sb.finalizeFromRowCount(s.ColStats.Get(i), s.RowCount)
	}
}

// finalizeFromRowCount adjusts colStat in place so that it is consistent with
// the given row count: a non-empty result gets at least one distinct value
// when it would otherwise report neither distinct nor null values, and both
// counts are capped at rowCount.
func (sb *statisticsBuilder) finalizeFromRowCount(
	colStat *props.ColumnStatistic, rowCount float64,
) {
	// If rows are expected but the column claims to hold no values at all,
	// seed the distinct count with a single value.
	noValues := colStat.DistinctCount == 0 && colStat.NullCount == 0
	if noValues && rowCount > 0 {
		colStat.DistinctCount = 1
	}

	// Neither the null count nor the distinct count may exceed the row count.
	colStat.NullCount = min(colStat.NullCount, rowCount)
	colStat.DistinctCount = min(colStat.DistinctCount, rowCount)
}

func min(a float64, b float64) float64 {
if a < b {
return a
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/memo
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ memo (optimized, ~5KB, required=[presentation: field:3])
├── G1: (distinct-on G2 G3 cols=(3))
│ └── [presentation: field:3]
│ ├── best: (distinct-on G2 G3 cols=(3))
│ └── cost: 0.04
│ └── cost: 0.05
├── G2: (explain G4 [presentation: k:1])
│ └── []
│ ├── best: (explain G4="[presentation: k:1]" [presentation: k:1])
Expand All @@ -379,7 +379,7 @@ memo (optimized, ~2KB, required=[presentation: tag:4])
├── G1: (distinct-on G2 G3 cols=(4))
│ └── [presentation: tag:4]
│ ├── best: (distinct-on G2 G3 cols=(4))
│ └── cost: 0.02
│ └── cost: 0.03
├── G2: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]})
│ └── []
│ ├── best: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]})
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/stats/groupby
Original file line number Diff line number Diff line change
Expand Up @@ -479,12 +479,12 @@ GROUP BY q.b
project
├── columns: "?column?":4(int!null)
├── cardinality: [0 - 3]
├── stats: [rows=0]
├── stats: [rows=1]
├── fd: ()-->(4)
├── select
│ ├── columns: column2:2(int) bool_or:3(bool!null)
│ ├── cardinality: [0 - 3]
│ ├── stats: [rows=0, distinct(3)=0, null(3)=0]
│ ├── stats: [rows=1, distinct(3)=1, null(3)=0]
│ ├── key: (2)
│ ├── fd: ()-->(3)
│ ├── group-by
Expand Down
36 changes: 36 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/limit
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,39 @@ limit
│ └── filters
│ └── s = 'foo' [type=bool, outer=(3), constraints=(/3: [/'foo' - /'foo']; tight), fd=()-->(3)]
└── const: 5 [type=int]

exec-ddl
CREATE TABLE b (x int)
----
TABLE b
├── x int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

# Regression test for #32578. Ensure that we don't estimate 0 rows for the
# offset.
opt colstat=1
SELECT * FROM b ORDER BY x LIMIT 1 OFFSET 9999
----
limit
├── columns: x:1(int)
├── internal-ordering: +1
├── cardinality: [0 - 1]
├── stats: [rows=1, distinct(1)=1, null(1)=0]
├── key: ()
├── fd: ()-->(1)
├── offset
│ ├── columns: x:1(int)
│ ├── internal-ordering: +1
│ ├── stats: [rows=1, distinct(1)=1, null(1)=0]
│ ├── ordering: +1
│ ├── sort
│ │ ├── columns: x:1(int)
│ │ ├── stats: [rows=1000, distinct(1)=100, null(1)=10]
│ │ ├── ordering: +1
│ │ └── scan b
│ │ ├── columns: x:1(int)
│ │ └── stats: [rows=1000, distinct(1)=100, null(1)=10]
│ └── const: 9999 [type=int]
└── const: 1 [type=int]
16 changes: 8 additions & 8 deletions pkg/sql/opt/memo/testdata/stats/ordinality
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ select
├── fd: (1)-->(2,3), (3)-->(1,2)
├── ordinality
│ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(2,3), (3)-->(1,2)
│ └── scan a
│ ├── columns: x:1(int!null) y:2(int)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
│ ├── key: (1)
│ └── fd: (1)-->(2)
└── filters
Expand All @@ -55,12 +55,12 @@ select
├── fd: (1)-->(2,3), (3)-->(1,2)
├── ordinality
│ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(2,3), (3)-->(1,2)
│ └── scan a
│ ├── columns: x:1(int!null) y:2(int)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0]
│ ├── key: (1)
│ └── fd: (1)-->(2)
└── filters
Expand Down Expand Up @@ -96,12 +96,12 @@ project
├── fd: (1)-->(3), (3)-->(1)
├── ordinality
│ ├── columns: x:1(int!null) ordinality:3(int!null)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(3), (3)-->(1)
│ └── scan a
│ ├── columns: x:1(int!null)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
│ └── key: (1)
└── filters
└── (ordinality > 0) AND (ordinality <= 10) [type=bool, outer=(3), constraints=(/3: [/1 - /10]; tight)]
Expand All @@ -118,12 +118,12 @@ select
├── fd: ()-->(1-3)
├── ordinality
│ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(2,3), (3)-->(1,2)
│ └── scan a
│ ├── columns: x:1(int!null) y:2(int)
│ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0]
│ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0]
│ ├── key: (1)
│ └── fd: (1)-->(2)
└── filters
Expand Down
Loading

0 comments on commit 13250f3

Please sign in to comment.