Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner: improve row count estimation for index range containing correlated columns (#9738) #9937

Merged
merged 3 commits into from
Mar 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cmd/explaintest/r/subquery.result
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,19 @@ HashLeftJoin_8 8000.00 root semi join, inner:TableReader_12, other cond:eq(test.
│ └─TableScan_9 10000.00 cop table:t1, range:[-inf,+inf], keep order:false, stats:pseudo
└─TableReader_12 10000.00 root data:TableScan_11
└─TableScan_11 10000.00 cop table:t2, range:[-inf,+inf], keep order:false, stats:pseudo
drop table if exists t;
create table t(a int primary key, b int, c int, d int, index idx(b,c,d));
insert into t values(1,1,1,1),(2,2,2,2),(3,2,2,2),(4,2,2,2),(5,2,2,2);
analyze table t;
explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = 1 and s.c = 1 and s.d = t.a and s.a = t1.a) from t;
id count task operator info
Projection_11 5.00 root 9_aux_0
└─Apply_13 5.00 root left outer semi join, inner:StreamAgg_20, other cond:eq(test.t.c, count(*))
├─TableReader_15 5.00 root data:TableScan_14
│ └─TableScan_14 5.00 cop table:t, range:[-inf,+inf], keep order:false
└─StreamAgg_20 1.00 root funcs:count(1)
└─IndexJoin_23 0.50 root inner join, inner:TableReader_22, outer key:s.a, inner key:t1.a
├─IndexReader_27 1.00 root index:IndexScan_26
│ └─IndexScan_26 1.00 cop table:s, index:b, c, d, range: decided by [eq(s.b, 1) eq(s.c, 1) eq(s.d, test.t.a)], keep order:false
└─TableReader_22 1.00 root data:TableScan_21
└─TableScan_21 1.00 cop table:t1, range: decided by [s.a], keep order:false
6 changes: 6 additions & 0 deletions cmd/explaintest/t/subquery.test
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ drop table if exists t2;
create table t1(a bigint, b bigint);
create table t2(a bigint, b bigint);
explain select * from t1 where t1.a in (select t1.b + t2.b from t2);

drop table if exists t;
create table t(a int primary key, b int, c int, d int, index idx(b,c,d));
insert into t values(1,1,1,1),(2,2,2,2),(3,2,2,2),(4,2,2,2),(5,2,2,2);
analyze table t;
explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = 1 and s.c = 1 and s.d = t.a and s.a = t1.a) from t;
4 changes: 2 additions & 2 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,9 +678,9 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
" └─MaxOneRow_13 1.00 root ",
" └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.80 root ",
" ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.80 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.00 cop table:t, keep order:false",
" └─TableScan_19 1.25 cop table:t, keep order:false",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reviewers: this row count change is caused by the fact that an index on column c exists, so we do not analyze column c itself; hence the estimated NDV of column c is ndv = float64(ds.statisticTable.Count) * distinctFactor, which is 8, not 10. This behavior differs from the master branch, because after #9315 we analyze column c even when an index on column c exists.

))
}

Expand Down
28 changes: 15 additions & 13 deletions planner/core/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,24 +465,26 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
} else {
path.tableFilters = ds.pushedDownConds
}
corColInAccessConds := false
if path.eqCondCount == len(path.accessConds) {
access, remained := path.splitCorColAccessCondFromFilters()
path.accessConds = append(path.accessConds, access...)
accesses, remained := path.splitCorColAccessCondFromFilters()
path.accessConds = append(path.accessConds, accesses...)
path.tableFilters = remained
if len(access) > 0 {
corColInAccessConds = true
}
}
path.indexFilters, path.tableFilters = splitIndexFilterConditions(path.tableFilters, path.index.Columns, ds.tableInfo)
if corColInAccessConds {
idxHist, ok := ds.stats.HistColl.Indices[path.index.ID]
if ok && !ds.stats.HistColl.Pseudo {
path.countAfterAccess = idxHist.AvgCountPerValue(ds.statisticTable.Count)
} else {
if len(accesses) > 0 && ds.statisticTable.Pseudo {
path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
} else {
selectivity := path.countAfterAccess / float64(ds.statisticTable.Count)
for i := range accesses {
col := path.idxCols[path.eqCondCount+i]
ndv := ds.getColumnNDV(col.ID)
ndv *= selectivity
if ndv < 1 {
ndv = 1.0
}
path.countAfterAccess = path.countAfterAccess / ndv
}
}
}
path.indexFilters, path.tableFilters = splitIndexFilterConditions(path.tableFilters, path.index.Columns, ds.tableInfo)
// If the `countAfterAccess` is less than `stats.RowCount`, there must be some inconsistent stats info.
// We prefer the `stats.RowCount` because it could use more stats info to calculate the selectivity.
if path.countAfterAccess < ds.stats.RowCount {
Expand Down
21 changes: 14 additions & 7 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,19 @@ func (p *baseLogicalPlan) deriveStats() (*property.StatsInfo, error) {
return profile, nil
}

// getColumnNDV estimates the NDV (number of distinct values) of the column
// identified by colID. It uses the DataSource's original histogram as loaded
// from storage (not the derived statistics).
func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
	if hist, ok := ds.statisticTable.Columns[colID]; ok && hist.Count > 0 {
		// Scale the histogram's NDV up to the full table row count, since the
		// histogram may cover only a sample of the rows.
		scale := float64(ds.statisticTable.Count) / float64(hist.Count)
		return float64(hist.NDV) * scale
	}
	// No usable histogram for this column: fall back to a fixed fraction of
	// the table row count as a pseudo estimate.
	return float64(ds.statisticTable.Count) * distinctFactor
}

func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *property.StatsInfo {
profile := &property.StatsInfo{
RowCount: float64(ds.statisticTable.Count),
Expand All @@ -68,13 +81,7 @@ func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *property.Stat
UsePseudoStats: ds.statisticTable.Pseudo,
}
for i, col := range ds.Columns {
hist, ok := ds.statisticTable.Columns[col.ID]
if ok && hist.Count > 0 {
factor := float64(ds.statisticTable.Count) / float64(hist.Count)
profile.Cardinality[i] = float64(hist.NDV) * factor
} else {
profile.Cardinality[i] = profile.RowCount * distinctFactor
}
profile.Cardinality[i] = ds.getColumnNDV(col.ID)
}
ds.stats = profile
selectivity, err := profile.HistColl.Selectivity(ds.ctx, conds)
Expand Down