diff --git a/cmd/explaintest/r/subquery.result b/cmd/explaintest/r/subquery.result index 934101ce4d320..2596bc5ae4208 100644 --- a/cmd/explaintest/r/subquery.result +++ b/cmd/explaintest/r/subquery.result @@ -9,3 +9,19 @@ HashLeftJoin_8 8000.00 root semi join, inner:TableReader_12, other cond:eq(test. │ └─TableScan_9 10000.00 cop table:t1, range:[-inf,+inf], keep order:false, stats:pseudo └─TableReader_12 10000.00 root data:TableScan_11 └─TableScan_11 10000.00 cop table:t2, range:[-inf,+inf], keep order:false, stats:pseudo +drop table if exists t; +create table t(a int primary key, b int, c int, d int, index idx(b,c,d)); +insert into t values(1,1,1,1),(2,2,2,2),(3,2,2,2),(4,2,2,2),(5,2,2,2); +analyze table t; +explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = 1 and s.c = 1 and s.d = t.a and s.a = t1.a) from t; +id count task operator info +Projection_11 5.00 root 9_aux_0 +└─Apply_13 5.00 root left outer semi join, inner:StreamAgg_20, other cond:eq(test.t.c, count(*)) + ├─TableReader_15 5.00 root data:TableScan_14 + │ └─TableScan_14 5.00 cop table:t, range:[-inf,+inf], keep order:false + └─StreamAgg_20 1.00 root funcs:count(1) + └─IndexJoin_23 0.50 root inner join, inner:TableReader_22, outer key:s.a, inner key:t1.a + ├─IndexReader_27 1.00 root index:IndexScan_26 + │ └─IndexScan_26 1.00 cop table:s, index:b, c, d, range: decided by [eq(s.b, 1) eq(s.c, 1) eq(s.d, test.t.a)], keep order:false + └─TableReader_22 1.00 root data:TableScan_21 + └─TableScan_21 1.00 cop table:t1, range: decided by [s.a], keep order:false diff --git a/cmd/explaintest/t/subquery.test b/cmd/explaintest/t/subquery.test index de17ee3e25b5c..be52346acac74 100644 --- a/cmd/explaintest/t/subquery.test +++ b/cmd/explaintest/t/subquery.test @@ -3,3 +3,9 @@ drop table if exists t2; create table t1(a bigint, b bigint); create table t2(a bigint, b bigint); explain select * from t1 where t1.a in (select t1.b + t2.b from t2); + +drop table if exists t; +create table t(a int primary key, b int, c int, d int, index idx(b,c,d)); +insert into t values(1,1,1,1),(2,2,2,2),(3,2,2,2),(4,2,2,2),(5,2,2,2); +analyze table t; +explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = 1 and s.c = 1 and s.d = t.a and s.a = t1.a) from t; diff --git a/planner/core/cbo_test.go b/planner/core/cbo_test.go index 992cd10540095..f3097b903c26f 100644 --- a/planner/core/cbo_test.go +++ b/planner/core/cbo_test.go @@ -678,9 +678,9 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) { " └─MaxOneRow_13 1.00 root ", " └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))", " └─IndexLookUp_21 0.80 root ", - " ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false", + " ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false", " └─Selection_20 0.80 cop eq(t1.a, test.t.a)", - " └─TableScan_19 1.00 cop table:t, keep order:false", + " └─TableScan_19 1.25 cop table:t, keep order:false", )) } diff --git a/planner/core/logical_plans.go b/planner/core/logical_plans.go index 20c912d91eedf..a360c7378f4fb 100644 --- a/planner/core/logical_plans.go +++ b/planner/core/logical_plans.go @@ -465,24 +465,26 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) { } else { path.tableFilters = ds.pushedDownConds } - corColInAccessConds := false if path.eqCondCount == len(path.accessConds) { - access, remained := path.splitCorColAccessCondFromFilters() - path.accessConds = append(path.accessConds, access...) + accesses, remained := path.splitCorColAccessCondFromFilters() + path.accessConds = append(path.accessConds, accesses...) path.tableFilters = remained - if len(access) > 0 { - corColInAccessConds = true - } - } - path.indexFilters, path.tableFilters = splitIndexFilterConditions(path.tableFilters, path.index.Columns, ds.tableInfo) - if corColInAccessConds { - idxHist, ok := ds.stats.HistColl.Indices[path.index.ID] - if ok && !ds.stats.HistColl.Pseudo { - path.countAfterAccess = idxHist.AvgCountPerValue(ds.statisticTable.Count) - } else { + if len(accesses) > 0 && ds.statisticTable.Pseudo { path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue() + } else { + selectivity := path.countAfterAccess / float64(ds.statisticTable.Count) + for i := range accesses { + col := path.idxCols[path.eqCondCount+i] + ndv := ds.getColumnNDV(col.ID) + ndv *= selectivity + if ndv < 1 { + ndv = 1.0 + } + path.countAfterAccess = path.countAfterAccess / ndv + } } } + path.indexFilters, path.tableFilters = splitIndexFilterConditions(path.tableFilters, path.index.Columns, ds.tableInfo) // If the `countAfterAccess` is less than `stats.RowCount`, there must be some inconsistent stats info. // We prefer the `stats.RowCount` because it could use more stats info to calculate the selectivity. if path.countAfterAccess < ds.stats.RowCount { diff --git a/planner/core/stats.go b/planner/core/stats.go index 393ebb42e97b8..91a80d096a7e3 100644 --- a/planner/core/stats.go +++ b/planner/core/stats.go @@ -60,6 +60,19 @@ func (p *baseLogicalPlan) deriveStats() (*property.StatsInfo, error) { return profile, nil } +// getColumnNDV computes estimated NDV of specified column using the original +// histogram of `DataSource` which is retrieved from storage(not the derived one). +func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) { + hist, ok := ds.statisticTable.Columns[colID] + if ok && hist.Count > 0 { + factor := float64(ds.statisticTable.Count) / float64(hist.Count) + ndv = float64(hist.NDV) * factor + } else { + ndv = float64(ds.statisticTable.Count) * distinctFactor + } + return ndv +} + func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *property.StatsInfo { profile := &property.StatsInfo{ RowCount: float64(ds.statisticTable.Count), @@ -68,13 +81,7 @@ func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *property.Stat UsePseudoStats: ds.statisticTable.Pseudo, } for i, col := range ds.Columns { - hist, ok := ds.statisticTable.Columns[col.ID] - if ok && hist.Count > 0 { - factor := float64(ds.statisticTable.Count) / float64(hist.Count) - profile.Cardinality[i] = float64(hist.NDV) * factor - } else { - profile.Cardinality[i] = profile.RowCount * distinctFactor - } + profile.Cardinality[i] = ds.getColumnNDV(col.ID) } ds.stats = profile selectivity, err := profile.HistColl.Selectivity(ds.ctx, conds)