Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner, statistics: maintain histogram for inner join #8097

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,32 @@ func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) {
))
}

// TestJoinWithHistogram checks that, with tidb_optimizer_selectivity_level
// enabled, an inner join's row count is estimated from the merged join-key
// histograms rather than the NDV-only formula.
func (s *testAnalyzeSuite) TestJoinWithHistogram(c *C) {
	defer testleak.AfterTest(c)()
	store, dom, err := newStoreWithBootstrap()
	c.Assert(err, IsNil)
	defer func() {
		dom.Close()
		store.Close()
	}()
	testKit := testkit.NewTestKit(c, store)
	for _, sql := range []string{
		"use test",
		"create table t(a int primary key, b int, index idx(b))",
		"create table tt(a int primary key, b int, index idx(b))",
		"insert into t values(1, 1), (2, 1), (3, 1), (4, 2), (5, 2), (6, 2), (7, 3), (8, 4), (9, 5)",
		"insert into tt values(1, 1), (3, 1), (5, 1), (7, 2), (9, 3), (15, 4)",
		"analyze table t, tt",
		"set @@session.tidb_optimizer_selectivity_level=1",
	} {
		testKit.MustExec(sql)
	}
	testKit.MustQuery("explain select * from t t1 join tt t2 where t1.a=t2.a").Check(testkit.Rows(
		"Projection_7 5.00 root t1.a, t1.b, t2.a, t2.b",
		"└─IndexJoin_11 5.00 root inner join, inner:TableReader_10, outer key:t2.a, inner key:t1.a",
		"  ├─TableReader_24 6.00 root data:TableScan_23",
		"  │ └─TableScan_23 6.00 cop table:t2, range:[-inf,+inf], keep order:false",
		"  └─TableReader_10 1.00 root data:TableScan_9",
		"    └─TableScan_9 1.00 cop table:t1, range: decided by [t2.a], keep order:false",
	))
}

func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
store, err := mockstore.NewMockTikvStore()
if err != nil {
Expand Down
80 changes: 80 additions & 0 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo) (*property.S
leftKeys = append(leftKeys, eqCond.GetArgs()[0].(*expression.Column))
rightKeys = append(rightKeys, eqCond.GetArgs()[1].(*expression.Column))
}
if p.JoinType == InnerJoin && p.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 {
return p.deriveInnerJoinStatsWithHist(leftKeys, rightKeys)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we pass the childStats parameter down to deriveInnerJoinStatsWithHist? so this DeriveStats function can be used by cascades planner as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the new commits do use childStats. But this method uses the schema of the join's children internally, so I need some way to avoid relying on it.

}
leftKeyCardinality := getCardinality(leftKeys, p.children[0].Schema(), leftProfile)
rightKeyCardinality := getCardinality(rightKeys, p.children[1].Schema(), rightProfile)
count := leftProfile.RowCount * rightProfile.RowCount / math.Max(leftKeyCardinality, rightKeyCardinality)
Expand All @@ -302,6 +305,83 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo) (*property.S
return p.stats, nil
}

func (p *LogicalJoin) deriveInnerJoinStatsWithHist(leftKeys, rightKeys []*expression.Column) (*property.StatsInfo, error) {
leftChild, rightChild := p.children[0], p.children[1]
leftProfile, rightProfile := leftChild.statsInfo(), rightChild.statsInfo()

cardinality := make([]float64, 0, p.schema.Len())
cardinality = append(cardinality, leftProfile.Cardinality...)
cardinality = append(cardinality, rightProfile.Cardinality...)

ndv, leftNdv, rightNdv := float64(1), float64(1), float64(1)
newColID2Hist := make(map[int64]*statistics.Column)

// TODO: Support using index histogram to calculate the NDV after join and the final row count.
for i := range leftKeys {
leftHist, ok1 := leftChild.statsInfo().HistColl.Columns[leftKeys[i].UniqueID]
rightHist, ok2 := rightChild.statsInfo().HistColl.Columns[rightKeys[i].UniqueID]
lPos := leftChild.Schema().ColumnIndex(leftKeys[i])
rPos := rightChild.Schema().ColumnIndex(rightKeys[i])
leftNdv *= leftProfile.Cardinality[lPos]
rightNdv *= rightProfile.Cardinality[rPos]
if ok1 && ok2 {
eurekaka marked this conversation as resolved.
Show resolved Hide resolved
newHist := statistics.MergeHistogramForInnerJoin(&leftHist.Histogram, &rightHist.Histogram, leftKeys[i].RetType)
leftCol := &statistics.Column{Info: leftHist.Info, Histogram: *newHist}
rightCol := &statistics.Column{Info: rightHist.Info, Histogram: *newHist}
lIncreaseFactor := leftHist.GetIncreaseFactor(leftChild.statsInfo().HistColl.Count)
// The factor is used to scale the NDV. When it's higher than one. NDV doesn't need to be changed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't we change NDV if this factor is larger than one?

if lIncreaseFactor > 1 {
lIncreaseFactor = 1
}
rIncreaseFactor := rightHist.GetIncreaseFactor(rightChild.statsInfo().HistColl.Count)
if rIncreaseFactor > 1 {
rIncreaseFactor = 1
}
ndv *= float64(newHist.NDV) * lIncreaseFactor * rIncreaseFactor
lPosNew := p.schema.ColumnIndex(leftKeys[i])
rPosNew := p.schema.ColumnIndex(rightKeys[i])
cardinality[lPosNew] = float64(newHist.NDV)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not multiply this cardinality by lIncreaseFactor like keyNdv.

cardinality[rPosNew] = float64(newHist.NDV)
newColID2Hist[leftKeys[i].UniqueID] = leftCol
newColID2Hist[rightKeys[i].UniqueID] = rightCol
continue
}
keyNdv := math.Min(leftChild.statsInfo().Cardinality[lPos], rightChild.statsInfo().Cardinality[rPos])
ndv *= keyNdv
}
count := leftProfile.RowCount / leftNdv * rightProfile.RowCount / rightNdv * ndv

// Update left column map in `HistColl`.
for uniqID, colHist := range leftProfile.HistColl.Columns {
_, ok := newColID2Hist[uniqID]
if ok {
continue
}
newColID2Hist[uniqID] = colHist
winoros marked this conversation as resolved.
Show resolved Hide resolved
}

// Update right column map in `HistColl`.
for uniqID, colHist := range rightProfile.HistColl.Columns {
_, ok := newColID2Hist[uniqID]
if ok {
continue
}
newColID2Hist[uniqID] = colHist
}
eurekaka marked this conversation as resolved.
Show resolved Hide resolved

// TODO: support calculate index histogram.
newHistColl := &statistics.HistColl{
Count: int64(count),
Columns: newColID2Hist,
}
p.stats = &property.StatsInfo{
RowCount: count,
Cardinality: cardinality,
HistColl: newHistColl,
}
return p.stats, nil
}

// DeriveStats implement LogicalPlan DeriveStats interface.
func (la *LogicalApply) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) {
leftProfile := childStats[0]
Expand Down
150 changes: 150 additions & 0 deletions planner/core/stats_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
// Copyright 2018 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package core

import (
"encoding/json"
"io/ioutil"
"path/filepath"

. "github.com/pingcap/check"
"github.com/pingcap/errors"
"github.com/pingcap/parser"
"github.com/pingcap/parser/ast"
"github.com/pingcap/parser/model"
"github.com/pingcap/tidb/ddl"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/planner/property"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/util/mock"
)

// Register the suite with gocheck so its tests run under `go test`.
var _ = Suite(&testStatsSuite{})

// testStatsSuite groups stats-derivation unit tests that only need a mocked
// session context, without bootstrapping real storage.
type testStatsSuite struct {
	ctx sessionctx.Context // mocked session context shared by the suite's tests
}

// SetUpTest creates a fresh mocked session context before each test so that
// tests do not share state through the context.
func (s *testStatsSuite) SetUpTest(c *C) {
	s.ctx = mock.NewContext()
}

// loadStatsByFileName reads the JSON-dumped statistics file named t from the
// testdata directory and builds a HistColl for the given table.
func (s *testStatsSuite) loadStatsByFileName(t string, tbl *model.TableInfo) (*statistics.HistColl, error) {
	data, err := ioutil.ReadFile(filepath.Join("testdata", t))
	if err != nil {
		return nil, errors.Trace(err)
	}
	jsonTbl := &statistics.JSONTable{}
	if err = json.Unmarshal(data, jsonTbl); err != nil {
		return nil, errors.Trace(err)
	}
	return statistics.HistCollFromJSON(tbl, tbl.ID, jsonTbl)
}

// mockTableInfo parses the given CREATE TABLE statement and builds a mocked
// TableInfo carrying the given table ID.
func (s *testStatsSuite) mockTableInfo(sql string, tblID int64) (*model.TableInfo, error) {
	node, err := parser.New().ParseOneStmt(sql, "", "")
	if err != nil {
		return nil, errors.Trace(err)
	}
	return ddl.MockTableInfo(mock.NewContext(), node.(*ast.CreateTableStmt), tblID)
}

func (s *testStatsSuite) TestInnerJoinStats(c *C) {
// t1's values are (i, i) where i = 1..9. Each pair appears once.
t1Tbl, err := s.mockTableInfo("create table t1(a int, b int, index a(a, b))", 1)
c.Assert(err, IsNil)
t1Coll, err := s.loadStatsByFileName("t1.json", t1Tbl)
alivxxx marked this conversation as resolved.
Show resolved Hide resolved
c.Assert(err, IsNil)
t1ExprCols := expression.ColumnInfos2ColumnsWithDBName(s.ctx, model.NewCIStr("test"), t1Tbl.Name, t1Tbl.Columns)
t1FinalColl := t1Coll.GenerateHistCollFromColumnInfo(t1Tbl.Columns, t1ExprCols)
t1StatsInfo := &property.StatsInfo{
RowCount: float64(t1Coll.Count),
Cardinality: make([]float64, len(t1ExprCols)),
HistColl: t1FinalColl,
}
for i := range t1ExprCols {
t1StatsInfo.Cardinality[i] = float64(t1FinalColl.Columns[t1ExprCols[i].UniqueID].NDV)
}
t1Child := DataSource{}.Init(s.ctx)
t1Child.schema = expression.NewSchema(t1ExprCols...)
t1Child.stats = t1StatsInfo

t2Tbl, err := s.mockTableInfo("create table t2(a int, b int, index a(a, b))", 2)
winoros marked this conversation as resolved.
Show resolved Hide resolved
c.Assert(err, IsNil)
// t2's values are (i, i) where i = 8..15. Each pair appears once.
t2Coll, err := s.loadStatsByFileName("t2.json", t2Tbl)
c.Assert(err, IsNil)
t2ExprCols := expression.ColumnInfos2ColumnsWithDBName(s.ctx, model.NewCIStr("test"), t2Tbl.Name, t2Tbl.Columns)
t2FinalColl := t2Coll.GenerateHistCollFromColumnInfo(t2Tbl.Columns, t2ExprCols)
t2StatsInfo := &property.StatsInfo{
RowCount: float64(t2Coll.Count),
Cardinality: make([]float64, len(t2ExprCols)),
HistColl: t2FinalColl,
}
for i := range t2ExprCols {
t2StatsInfo.Cardinality[i] = float64(t2FinalColl.Columns[t2ExprCols[i].UniqueID].NDV)
}
t2Child := DataSource{}.Init(s.ctx)
t2Child.schema = expression.NewSchema(t2ExprCols...)
t2Child.stats = t2StatsInfo

join := LogicalJoin{}.Init(s.ctx)
join.SetChildren(t1Child, t2Child)
join.schema = expression.MergeSchema(t1Child.schema, t2Child.schema)
finalStats, err := join.deriveInnerJoinStatsWithHist([]*expression.Column{t1Child.schema.Columns[0]}, []*expression.Column{t2Child.schema.Columns[0]})
c.Assert(err, IsNil)
c.Assert(finalStats.RowCount, Equals, float64(2))
c.Assert(len(finalStats.HistColl.Columns), Equals, 4)
ans1 := `column:0 ndv:2 totColSize:0
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1`
c.Assert(finalStats.HistColl.Columns[1].String(), Equals, ans1)
ans2 := `column:2 ndv:9 totColSize:9
num: 1 lower_bound: 1 upper_bound: 1 repeats: 1
num: 1 lower_bound: 2 upper_bound: 2 repeats: 1
num: 1 lower_bound: 3 upper_bound: 3 repeats: 1
num: 1 lower_bound: 4 upper_bound: 4 repeats: 1
num: 1 lower_bound: 5 upper_bound: 5 repeats: 1
num: 1 lower_bound: 6 upper_bound: 6 repeats: 1
num: 1 lower_bound: 7 upper_bound: 7 repeats: 1
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1`
c.Assert(finalStats.HistColl.Columns[2].String(), Equals, ans2)
c.Assert(finalStats.HistColl.Columns[3].String(), Equals, ans1)
ans4 := `column:2 ndv:8 totColSize:8
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1
num: 1 lower_bound: 10 upper_bound: 10 repeats: 1
num: 1 lower_bound: 11 upper_bound: 11 repeats: 1
num: 1 lower_bound: 12 upper_bound: 12 repeats: 1
num: 1 lower_bound: 13 upper_bound: 13 repeats: 1
num: 1 lower_bound: 14 upper_bound: 14 repeats: 1
num: 1 lower_bound: 15 upper_bound: 15 repeats: 1`
c.Assert(finalStats.HistColl.Columns[4].String(), Equals, ans4)
t2StatsInfo.RowCount /= 2
t2StatsInfo.HistColl.Count /= 2
for i := range t2StatsInfo.Cardinality {
t2StatsInfo.Cardinality[i] /= 2
}
finalStats, err = join.deriveInnerJoinStatsWithHist([]*expression.Column{t1Child.schema.Columns[0]}, []*expression.Column{t2Child.schema.Columns[0]})
c.Assert(err, IsNil)
c.Assert(finalStats.RowCount, Equals, float64(1))
}
1 change: 1 addition & 0 deletions planner/core/testdata/t1.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions planner/core/testdata/t2.json

Large diffs are not rendered by default.

27 changes: 19 additions & 8 deletions statistics/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,17 +171,28 @@ func (h *Handle) loadStatsFromJSON(tableInfo *model.TableInfo, physicalID int64,

// TableStatsFromJSON loads statistic from JSONTable and return the Table of statistic.
func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *JSONTable) (*Table, error) {
newHistColl := HistColl{
coll, err := HistCollFromJSON(tableInfo, physicalID, jsonTbl)
if err != nil {
return nil, err
}

tbl := &Table{
HistColl: *coll,
}

return tbl, nil
}

// HistCollFromJSON build HistColl from JSONTable.
func HistCollFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *JSONTable) (*HistColl, error) {
newHistColl := &HistColl{
PhysicalID: physicalID,
HavePhysicalID: true,
Count: jsonTbl.Count,
ModifyCount: jsonTbl.ModifyCount,
Columns: make(map[int64]*Column, len(jsonTbl.Columns)),
Indices: make(map[int64]*Index, len(jsonTbl.Indices)),
}
tbl := &Table{
HistColl: newHistColl,
}
for id, jsonIdx := range jsonTbl.Indices {
for _, idxInfo := range tableInfo.Indices {
if idxInfo.Name.L != id {
Expand All @@ -194,7 +205,7 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
CMSketch: CMSketchFromProto(jsonIdx.CMSketch),
Info: idxInfo,
}
tbl.Indices[idx.ID] = idx
newHistColl.Indices[idx.ID] = idx
}
}

Expand All @@ -208,7 +219,7 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
sc := &stmtctx.StatementContext{TimeZone: time.UTC}
hist, err := hist.ConvertTo(sc, &colInfo.FieldType)
if err != nil {
return nil, errors.Trace(err)
return nil, err
}
hist.ID, hist.NullCount, hist.LastUpdateVersion, hist.TotColSize = colInfo.ID, jsonCol.NullCount, jsonCol.LastUpdateVersion, jsonCol.TotColSize
col := &Column{
Expand All @@ -218,8 +229,8 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
Count: count,
isHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag),
}
tbl.Columns[col.ID] = col
newHistColl.Columns[col.ID] = col
}
}
return tbl, nil
return newHistColl, nil
}
10 changes: 5 additions & 5 deletions statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -832,11 +832,11 @@ func (q *QueryFeedback) recalculateExpectCount(h *Handle) error {
if isIndex {
idx := t.Indices[id]
expected, err = idx.getRowCount(sc, ranges, t.ModifyCount)
expected *= idx.getIncreaseFactor(t.Count)
expected *= idx.GetIncreaseFactor(t.Count)
} else {
c := t.Columns[id]
expected, err = c.getColumnRowCount(sc, ranges, t.ModifyCount)
expected *= c.getIncreaseFactor(t.Count)
expected *= c.GetIncreaseFactor(t.Count)
}
if err != nil {
return errors.Trace(err)
Expand Down Expand Up @@ -981,13 +981,13 @@ func (q *QueryFeedback) logDetailedInfo(h *Handle) {
if idx == nil || idx.Histogram.Len() == 0 {
return
}
logForIndex(logPrefix, t, idx, ranges, actual, idx.getIncreaseFactor(t.Count))
logForIndex(logPrefix, t, idx, ranges, actual, idx.GetIncreaseFactor(t.Count))
} else {
c := t.Columns[q.hist.ID]
if c == nil || c.Histogram.Len() == 0 {
return
}
logForPK(logPrefix, c, ranges, actual, c.getIncreaseFactor(t.Count))
logForPK(logPrefix, c, ranges, actual, c.GetIncreaseFactor(t.Count))
}
}

Expand Down Expand Up @@ -1030,7 +1030,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
log.Debug("encode keys failed: err", err)
continue
}
equalityCount := float64(idx.CMSketch.QueryBytes(bytes)) * idx.getIncreaseFactor(t.Count)
equalityCount := float64(idx.CMSketch.QueryBytes(bytes)) * idx.GetIncreaseFactor(t.Count)
rang := ranger.Range{
LowVal: []types.Datum{ran.LowVal[rangePosition]},
HighVal: []types.Datum{ran.HighVal[rangePosition]},
Expand Down
Loading