diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index 261e7152b9d3..910f4e348df3 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -550,6 +550,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -574,7 +575,9 @@ func (sb *statisticsBuilder) colStatVirtualScan( colSet opt.ColSet, scan *VirtualScanExpr, ) *props.ColumnStatistic { s := &scan.Relational().Stats - return sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet)) + colStat := sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet)) + sb.finalizeFromRowCount(colStat, s.RowCount) + return colStat } // +--------+ @@ -655,6 +658,7 @@ func (sb *statisticsBuilder) colStatSelect( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -739,6 +743,7 @@ func (sb *statisticsBuilder) colStatProject( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -962,6 +967,7 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props // Column stats come from left side of join. colStat := sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join)) colStat.ApplySelectivity(s.Selectivity, leftProps.Stats.RowCount) + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat default: @@ -1048,15 +1054,10 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props ) } - // The distinct count should be no larger than the row count. - if colStat.DistinctCount > s.RowCount { - colStat.DistinctCount = s.RowCount - } - // Similarly, the null count should be no larger than RowCount. - colStat.NullCount = min(s.RowCount, colStat.NullCount) if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } } @@ -1244,15 +1245,10 @@ func (sb *statisticsBuilder) colStatIndexJoin( colStat.NullCount = inputStats.RowCount * (f1 + f2 - f1*f2) } - // The distinct count should be no larger than the row count. - if colStat.DistinctCount > s.RowCount { - colStat.DistinctCount = s.RowCount - } - // Similarly, the null count should be no larger than RowCount. - colStat.NullCount = min(s.RowCount, colStat.NullCount) if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1420,11 +1416,11 @@ func (sb *statisticsBuilder) colStatGroupBy( inputRowCount := sb.statsFromChild(groupNode, 0 /* childIdx */).RowCount colStat.NullCount = ((colStat.DistinctCount + 1) / inputRowCount) * inputColStat.NullCount } - colStat.NullCount = min(s.RowCount, colStat.NullCount) if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1521,6 +1517,7 @@ func (sb *statisticsBuilder) colStatSetNodeImpl( if outputCols.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1585,6 +1582,7 @@ func (sb *statisticsBuilder) colStatValues( colStat, _ := s.ColStats.Add(colSet) colStat.DistinctCount = float64(len(distinct)) colStat.NullCount = float64(nullCount) + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1629,6 +1627,7 @@ func (sb *statisticsBuilder) colStatLimit( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1675,6 +1674,7 @@ func (sb *statisticsBuilder) colStatOffset( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1703,6 +1703,7 @@ func (sb *statisticsBuilder) colStatMax1Row( if colSet.SubsetOf(max1Row.Relational().NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1753,6 +1754,7 @@ func (sb *statisticsBuilder) colStatOrdinality( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1817,6 +1819,7 @@ func (sb *statisticsBuilder) colStatWindow( if colSet.SubsetOf(relProps.NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1944,13 +1947,10 @@ func (sb *statisticsBuilder) colStatProjectSet( colStat.NullCount = s.RowCount * (f1 + f2 - f1*f2) } - // The distinct count and null count should be no larger than the row count. - colStat.DistinctCount = min(s.RowCount, colStat.DistinctCount) - colStat.NullCount = min(s.RowCount, colStat.NullCount) - if colSet.SubsetOf(projectSet.Relational().NotNullCols) { colStat.NullCount = 0 } + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -1986,6 +1986,7 @@ func (sb *statisticsBuilder) colStatMutation( colStat, _ := s.ColStats.Add(colSet) colStat.DistinctCount = inColStat.DistinctCount colStat.NullCount = inColStat.NullCount + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -2007,6 +2008,7 @@ func (sb *statisticsBuilder) colStatSequenceSelect( colStat, _ := s.ColStats.Add(colSet) colStat.DistinctCount = 1 colStat.NullCount = 0 + sb.finalizeFromRowCount(colStat, s.RowCount) return colStat } @@ -2100,6 +2102,17 @@ func translateColSet(colSetIn opt.ColSet, from opt.ColList, to opt.ColList) opt. func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) { s := &relProps.Stats + + // We don't ever want row count = 0 unless the cardinality is zero. + // This is because the stats may be stale, and we can end up with weird and + // inefficient plans if we estimate 0 rows. + // + // Increment the row count here if necessary, but it may be reduced below if + // the cardinality is 0. + if s.RowCount <= 0 { + s.RowCount = 1 + } + // The row count should be between the min and max cardinality. if s.RowCount > float64(relProps.Cardinality.Max) && relProps.Cardinality.Max != math.MaxUint32 { s.RowCount = float64(relProps.Cardinality.Max) @@ -2107,14 +2120,26 @@ func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) s.RowCount = float64(relProps.Cardinality.Min) } - // The distinct and null counts should be no larger than the row count. for i, n := 0, s.ColStats.Count(); i < n; i++ { colStat := s.ColStats.Get(i) - colStat.DistinctCount = min(colStat.DistinctCount, s.RowCount) - colStat.NullCount = min(colStat.NullCount, s.RowCount) + sb.finalizeFromRowCount(colStat, s.RowCount) } } +func (sb *statisticsBuilder) finalizeFromRowCount( + colStat *props.ColumnStatistic, rowCount float64, +) { + // We should always have at least one distinct or null value if + // row count > 0. + if rowCount > 0 && colStat.DistinctCount == 0 && colStat.NullCount == 0 { + colStat.DistinctCount = 1 + } + + // The distinct and null counts should be no larger than the row count. + colStat.DistinctCount = min(colStat.DistinctCount, rowCount) + colStat.NullCount = min(colStat.NullCount, rowCount) +} + func min(a float64, b float64) float64 { if a < b { return a diff --git a/pkg/sql/opt/memo/testdata/memo b/pkg/sql/opt/memo/testdata/memo index 70b149b2abfa..30b96b7bf1c0 100644 --- a/pkg/sql/opt/memo/testdata/memo +++ b/pkg/sql/opt/memo/testdata/memo @@ -357,7 +357,7 @@ memo (optimized, ~5KB, required=[presentation: field:3]) ├── G1: (distinct-on G2 G3 cols=(3)) │ └── [presentation: field:3] │ ├── best: (distinct-on G2 G3 cols=(3)) - │ └── cost: 0.04 + │ └── cost: 0.05 ├── G2: (explain G4 [presentation: k:1]) │ └── [] │ ├── best: (explain G4="[presentation: k:1]" [presentation: k:1]) @@ -379,7 +379,7 @@ memo (optimized, ~2KB, required=[presentation: tag:4]) ├── G1: (distinct-on G2 G3 cols=(4)) │ └── [presentation: tag:4] │ ├── best: (distinct-on G2 G3 cols=(4)) - │ └── cost: 0.02 + │ └── cost: 0.03 ├── G2: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]}) │ └── [] │ ├── best: (show-trace-for-session &{TRACE false [1 2 3 4 5 6 7]}) diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby index 623ef1d59f31..049ab2027758 100644 --- a/pkg/sql/opt/memo/testdata/stats/groupby +++ b/pkg/sql/opt/memo/testdata/stats/groupby @@ -479,12 +479,12 @@ GROUP BY q.b project ├── columns: "?column?":4(int!null) ├── cardinality: [0 - 3] - ├── stats: [rows=0] + ├── stats: [rows=1] ├── fd: ()-->(4) ├── select │ ├── columns: column2:2(int) bool_or:3(bool!null) │ ├── cardinality: [0 - 3] - │ ├── stats: [rows=0, distinct(3)=0, null(3)=0] + │ ├── stats: [rows=1, distinct(3)=1, null(3)=0] │ ├── key: (2) │ ├── fd: ()-->(3) │ ├── group-by diff --git a/pkg/sql/opt/memo/testdata/stats/limit b/pkg/sql/opt/memo/testdata/stats/limit index af7f68abf09d..d4c4c2bc1872 100644 --- a/pkg/sql/opt/memo/testdata/stats/limit +++ b/pkg/sql/opt/memo/testdata/stats/limit @@ -209,3 +209,39 @@ limit │ └── filters │ └── s = 'foo' [type=bool, outer=(3), constraints=(/3: [/'foo' - /'foo']; tight), fd=()-->(3)] └── const: 5 [type=int] + +exec-ddl +CREATE TABLE b (x int) +---- +TABLE b + ├── x int + ├── rowid int not null (hidden) + └── INDEX primary + └── rowid int not null (hidden) + +# Regression test for #32578. Ensure that we don't estimate 0 rows for the +# offset. +opt colstat=1 +SELECT * FROM b ORDER BY x LIMIT 1 OFFSET 9999 +---- +limit + ├── columns: x:1(int) + ├── internal-ordering: +1 + ├── cardinality: [0 - 1] + ├── stats: [rows=1, distinct(1)=1, null(1)=0] + ├── key: () + ├── fd: ()-->(1) + ├── offset + │ ├── columns: x:1(int) + │ ├── internal-ordering: +1 + │ ├── stats: [rows=1, distinct(1)=1, null(1)=0] + │ ├── ordering: +1 + │ ├── sort + │ │ ├── columns: x:1(int) + │ │ ├── stats: [rows=1000, distinct(1)=100, null(1)=10] + │ │ ├── ordering: +1 + │ │ └── scan b + │ │ ├── columns: x:1(int) + │ │ └── stats: [rows=1000, distinct(1)=100, null(1)=10] + │ └── const: 9999 [type=int] + └── const: 1 [type=int] diff --git a/pkg/sql/opt/memo/testdata/stats/ordinality b/pkg/sql/opt/memo/testdata/stats/ordinality index d6e48a45a272..40155e944f83 100644 --- a/pkg/sql/opt/memo/testdata/stats/ordinality +++ b/pkg/sql/opt/memo/testdata/stats/ordinality @@ -34,12 +34,12 @@ select ├── fd: (1)-->(2,3), (3)-->(1,2) ├── ordinality │ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0] │ ├── key: (1) │ ├── fd: (1)-->(2,3), (3)-->(1,2) │ └── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters @@ -55,12 +55,12 @@ select ├── fd: (1)-->(2,3), (3)-->(1,2) ├── ordinality │ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=4000, null(3)=0] │ ├── key: (1) │ ├── fd: (1)-->(2,3), (3)-->(1,2) │ └── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters @@ -96,12 +96,12 @@ project ├── fd: (1)-->(3), (3)-->(1) ├── ordinality │ ├── columns: x:1(int!null) ordinality:3(int!null) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0] │ ├── key: (1) │ ├── fd: (1)-->(3), (3)-->(1) │ └── scan a │ ├── columns: x:1(int!null) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0] │ └── key: (1) └── filters └── (ordinality > 0) AND (ordinality <= 10) [type=bool, outer=(3), constraints=(/3: [/1 - /10]; tight)] @@ -118,12 +118,12 @@ select ├── fd: ()-->(1-3) ├── ordinality │ ├── columns: x:1(int!null) y:2(int) ordinality:3(int!null) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(3)=4000, null(3)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(3)=4000, null(3)=0] │ ├── key: (1) │ ├── fd: (1)-->(2,3), (3)-->(1,2) │ └── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select index 36f3a76a93e0..6d3b0a749074 100644 --- a/pkg/sql/opt/memo/testdata/stats/select +++ b/pkg/sql/opt/memo/testdata/stats/select @@ -110,7 +110,7 @@ select ├── fd: (1)-->(2) ├── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters @@ -127,7 +127,7 @@ select ├── fd: ()-->(2) ├── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0, distinct(2)=400, null(2)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters @@ -141,12 +141,12 @@ SELECT * FROM a WHERE x IS NULL select ├── columns: x:1(int!null) y:2(int) ├── cardinality: [0 - 1] - ├── stats: [rows=0.8, distinct(1)=0.8, null(1)=0] + ├── stats: [rows=1, distinct(1)=1, null(1)=0] ├── key: () ├── fd: ()-->(1,2) ├── scan a │ ├── columns: x:1(int!null) y:2(int) - │ ├── stats: [rows=4000, distinct(1)=5000, null(1)=0] + │ ├── stats: [rows=4000, distinct(1)=4000, null(1)=0] │ ├── key: (1) │ └── fd: (1)-->(2) └── filters @@ -263,11 +263,11 @@ SELECT count(*) FROM (SELECT * FROM tab0 WHERE col3 = 10) GROUP BY col0 ---- project ├── columns: count:8(int) - ├── stats: [rows=0] + ├── stats: [rows=1] └── group-by ├── columns: col0:2(int) count_rows:8(int) ├── grouping columns: col0:2(int) - ├── stats: [rows=0, distinct(2)=0, null(2)=0] + ├── stats: [rows=1, distinct(2)=0, null(2)=1] ├── key: (2) ├── fd: (2)-->(8) ├── select diff --git a/pkg/sql/opt/memo/testdata/stats_quality/tpcc b/pkg/sql/opt/memo/testdata/stats_quality/tpcc index bf7d21d6321f..6df959e0e030 100644 --- a/pkg/sql/opt/memo/testdata/stats_quality/tpcc +++ b/pkg/sql/opt/memo/testdata/stats_quality/tpcc @@ -1497,7 +1497,7 @@ project ├── save-table-name: delivery_01_project_1 ├── columns: no_o_id:1(int!null) ├── cardinality: [0 - 1] - ├── stats: [rows=1, distinct(1)=94.6943635, null(1)=0] + ├── stats: [rows=1, distinct(1)=1, null(1)=0] ├── cost: 1.09 ├── key: () ├── fd: ()-->(1) @@ -1507,7 +1507,7 @@ project ├── columns: no_o_id:1(int!null) no_d_id:2(int!null) no_w_id:3(int!null) ├── constraint: /3/2/-1: [/7/6 - /7/6] ├── limit: 1(rev) - ├── stats: [rows=1, distinct(1)=94.6943635, null(1)=0, distinct(2)=9.99954852, null(2)=0, distinct(3)=9.99954852, null(3)=0] + ├── stats: [rows=1, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0] ├── cost: 1.07 ├── key: () ├── fd: ()-->(1-3) @@ -1522,9 +1522,9 @@ column_names row_count distinct_count null_count {no_w_id} 1 1 0 ~~~~ column_names row_count_est row_count_err distinct_count_est distinct_count_err null_count_est null_count_err -{no_d_id} 1.00 1.00 10.00 10.00 <== 0.00 1.00 -{no_o_id} 1.00 1.00 95.00 95.00 <== 0.00 1.00 -{no_w_id} 1.00 1.00 10.00 10.00 <== 0.00 1.00 +{no_d_id} 1.00 1.00 1.00 1.00 0.00 1.00 +{no_o_id} 1.00 1.00 1.00 1.00 0.00 1.00 +{no_w_id} 1.00 1.00 1.00 1.00 0.00 1.00 save-tables format=hide-qual database=tpcc save-tables-prefix=delivery_02 SELECT sum(ol_amount) diff --git a/pkg/sql/opt/xform/testdata/external/tpcc b/pkg/sql/opt/xform/testdata/external/tpcc index 2fe4aedf8ee0..4b415d348b9c 100644 --- a/pkg/sql/opt/xform/testdata/external/tpcc +++ b/pkg/sql/opt/xform/testdata/external/tpcc @@ -1104,8 +1104,8 @@ ORDER BY i_id scan item ├── columns: i_price:4(decimal) i_name:3(varchar) i_data:5(varchar) [hidden: i_id:1(int!null)] ├── constraint: /1: [/25 - /25] [/50 - /50] [/75 - /75] [/100 - /100] [/125 - /125] [/150 - /150] [/175 - /175] [/200 - /200] [/225 - /225] [/250 - /250] [/275 - /275] [/300 - /300] - ├── stats: [rows=11.8491602, distinct(1)=11.8491602, null(1)=0] - ├── cost: 12.9255846 + ├── stats: [rows=12, distinct(1)=12, null(1)=0] + ├── cost: 13.09 ├── key: (1) ├── fd: (1)-->(3-5) ├── ordering: +1 @@ -2735,8 +2735,8 @@ ORDER BY i_id scan item ├── columns: i_price:4(decimal) i_name:3(varchar) i_data:5(varchar) [hidden: i_id:1(int!null)] ├── constraint: /1: [/25 - /25] [/50 - /50] [/75 - /75] [/100 - /100] [/125 - /125] [/150 - /150] [/175 - /175] [/200 - /200] [/225 - /225] [/250 - /250] [/275 - /275] [/300 - /300] - ├── stats: [rows=11.8491602, distinct(1)=11.8491602, null(1)=0] - ├── cost: 12.9255846 + ├── stats: [rows=12, distinct(1)=12, null(1)=0] + ├── cost: 13.09 ├── key: (1) ├── fd: (1)-->(3-5) ├── ordering: +1