-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
planner, statistics: maintain histogram for inner join #8097
Changes from 15 commits
8fe06a6
5347dc4
559031f
13ea82d
cea7095
b2ff7c1
2adb8dd
ea60d70
8a9ac4f
325c13d
d9a25d1
0579e83
6211577
44c6cb7
41b5a4b
c3befc4
1c80d94
686c50c
e1cecd5
15485aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -281,6 +281,9 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo) (*property.S | |
leftKeys = append(leftKeys, eqCond.GetArgs()[0].(*expression.Column)) | ||
rightKeys = append(rightKeys, eqCond.GetArgs()[1].(*expression.Column)) | ||
} | ||
if p.JoinType == InnerJoin && p.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 { | ||
return p.deriveInnerJoinStatsWithHist(leftKeys, rightKeys) | ||
} | ||
leftKeyCardinality := getCardinality(leftKeys, p.children[0].Schema(), leftProfile) | ||
rightKeyCardinality := getCardinality(rightKeys, p.children[1].Schema(), rightProfile) | ||
count := leftProfile.RowCount * rightProfile.RowCount / math.Max(leftKeyCardinality, rightKeyCardinality) | ||
|
@@ -302,6 +305,83 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo) (*property.S | |
return p.stats, nil | ||
} | ||
|
||
// deriveInnerJoinStatsWithHist derives the stats of an inner join from the stats of its two children, | ||
// using column histograms of the join keys where both sides have one. | ||
// | ||
// leftKeys[i] and rightKeys[i] form the i-th equal-condition key pair; leftKeys come from | ||
// p.children[0] and rightKeys from p.children[1]. | ||
// | ||
// The result is stored in p.stats and returned; the returned error is always nil. | ||
func (p *LogicalJoin) deriveInnerJoinStatsWithHist(leftKeys, rightKeys []*expression.Column) (*property.StatsInfo, error) { | ||
leftChild, rightChild := p.children[0], p.children[1] | ||
leftProfile, rightProfile := leftChild.statsInfo(), rightChild.statsInfo() | ||
|
||
// Start from the children's cardinalities laid out left-then-right; join key entries are overwritten below. | ||
cardinality := make([]float64, 0, p.schema.Len()) | ||
cardinality = append(cardinality, leftProfile.Cardinality...) | ||
cardinality = append(cardinality, rightProfile.Cardinality...) | ||
|
||
ndv, leftNdv, rightNdv := float64(1), float64(1), float64(1) | ||
newColID2Hist := make(map[int64]*statistics.Column) | ||
|
||
// TODO: Support using index histogram to calculate the NDV after join and the final row count. | ||
for i := range leftKeys { | ||
leftHist, ok1 := leftChild.statsInfo().HistColl.Columns[leftKeys[i].UniqueID] | ||
rightHist, ok2 := rightChild.statsInfo().HistColl.Columns[rightKeys[i].UniqueID] | ||
lPos := leftChild.Schema().ColumnIndex(leftKeys[i]) | ||
rPos := rightChild.Schema().ColumnIndex(rightKeys[i]) | ||
// leftNdv and rightNdv are the products of the key cardinalities on each side; they are the divisors of the row count estimate. | ||
leftNdv *= leftProfile.Cardinality[lPos] | ||
rightNdv *= rightProfile.Cardinality[rPos] | ||
if ok1 && ok2 { | ||
eurekaka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
newHist := statistics.MergeHistogramForInnerJoin(&leftHist.Histogram, &rightHist.Histogram, leftKeys[i].RetType) | ||
leftCol := &statistics.Column{Info: leftHist.Info, Histogram: *newHist} | ||
rightCol := &statistics.Column{Info: rightHist.Info, Histogram: *newHist} | ||
lIncreaseFactor := leftHist.GetIncreaseFactor(leftChild.statsInfo().HistColl.Count) | ||
// The factor is used to scale the NDV. When it's higher than one, the NDV doesn't need to be changed. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't we change NDV if this factor is larger than one? |
||
if lIncreaseFactor > 1 { | ||
lIncreaseFactor = 1 | ||
} | ||
rIncreaseFactor := rightHist.GetIncreaseFactor(rightChild.statsInfo().HistColl.Count) | ||
if rIncreaseFactor > 1 { | ||
rIncreaseFactor = 1 | ||
} | ||
// Both sides have a histogram: use the merged histogram's NDV, scaled by the two capped factors. | ||
ndv *= float64(newHist.NDV) * lIncreaseFactor * rIncreaseFactor | ||
lPosNew := p.schema.ColumnIndex(leftKeys[i]) | ||
rPosNew := p.schema.ColumnIndex(rightKeys[i]) | ||
cardinality[lPosNew] = float64(newHist.NDV) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not multiply this cardinality by |
||
cardinality[rPosNew] = float64(newHist.NDV) | ||
newColID2Hist[leftKeys[i].UniqueID] = leftCol | ||
newColID2Hist[rightKeys[i].UniqueID] = rightCol | ||
continue | ||
} | ||
// At least one side has no histogram for this key: fall back to the smaller of the two key cardinalities. | ||
keyNdv := math.Min(leftChild.statsInfo().Cardinality[lPos], rightChild.statsInfo().Cardinality[rPos]) | ||
ndv *= keyNdv | ||
} | ||
// Estimated rows = (left rows / left key NDV) * (right rows / right key NDV) * NDV of the keys after the join. | ||
count := leftProfile.RowCount / leftNdv * rightProfile.RowCount / rightNdv * ndv | ||
|
||
// Update left column map in `HistColl`. | ||
for uniqID, colHist := range leftProfile.HistColl.Columns { | ||
_, ok := newColID2Hist[uniqID] | ||
if ok { | ||
continue | ||
} | ||
newColID2Hist[uniqID] = colHist | ||
winoros marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// Update right column map in `HistColl`. | ||
for uniqID, colHist := range rightProfile.HistColl.Columns { | ||
_, ok := newColID2Hist[uniqID] | ||
if ok { | ||
continue | ||
} | ||
newColID2Hist[uniqID] = colHist | ||
} | ||
eurekaka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// TODO: support calculate index histogram. | ||
// Only Count and Columns are filled in on the new HistColl; the float estimate is truncated to int64 for Count. | ||
newHistColl := &statistics.HistColl{ | ||
Count: int64(count), | ||
Columns: newColID2Hist, | ||
} | ||
p.stats = &property.StatsInfo{ | ||
RowCount: count, | ||
Cardinality: cardinality, | ||
HistColl: newHistColl, | ||
} | ||
return p.stats, nil | ||
} | ||
|
||
// DeriveStats implement LogicalPlan DeriveStats interface. | ||
func (la *LogicalApply) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { | ||
leftProfile := childStats[0] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
// Copyright 2018 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package core | ||
|
||
import ( | ||
"encoding/json" | ||
"io/ioutil" | ||
"path/filepath" | ||
|
||
. "github.com/pingcap/check" | ||
"github.com/pingcap/errors" | ||
"github.com/pingcap/parser" | ||
"github.com/pingcap/parser/ast" | ||
"github.com/pingcap/parser/model" | ||
"github.com/pingcap/tidb/ddl" | ||
"github.com/pingcap/tidb/expression" | ||
"github.com/pingcap/tidb/planner/property" | ||
"github.com/pingcap/tidb/sessionctx" | ||
"github.com/pingcap/tidb/statistics" | ||
"github.com/pingcap/tidb/util/mock" | ||
) | ||
|
||
// Pass a testStatsSuite instance to Suite from the dot-imported check package. | ||
var _ = Suite(&testStatsSuite{}) | ||
|
||
// testStatsSuite holds the session context used by the stats tests in this file. | ||
type testStatsSuite struct { | ||
ctx sessionctx.Context | ||
} | ||
|
||
// SetUpTest replaces s.ctx with a newly created mock context. | ||
func (s *testStatsSuite) SetUpTest(c *C) { | ||
s.ctx = mock.NewContext() | ||
} | ||
|
||
// loadStatsByFileName reads the file named t under the "testdata" directory, unmarshals its JSON | ||
// into a statistics.JSONTable, and converts that into a HistColl for tbl with statistics.HistCollFromJSON. | ||
// | ||
// Read and unmarshal failures are returned wrapped by errors.Trace with a nil HistColl; | ||
// the result and error of HistCollFromJSON are returned unchanged. | ||
func (s *testStatsSuite) loadStatsByFileName(t string, tbl *model.TableInfo) (*statistics.HistColl, error) { | ||
statsTbl := &statistics.JSONTable{} | ||
path := filepath.Join("testdata", t) | ||
bytes, err := ioutil.ReadFile(path) | ||
if err != nil { | ||
return nil, errors.Trace(err) | ||
} | ||
err = json.Unmarshal(bytes, statsTbl) | ||
if err != nil { | ||
return nil, errors.Trace(err) | ||
} | ||
|
||
coll, err := statistics.HistCollFromJSON(tbl, tbl.ID, statsTbl) | ||
|
||
return coll, err | ||
} | ||
|
||
// mockTableInfo parses sql as a single statement and passes it, together with tblID, to | ||
// ddl.MockTableInfo to build a TableInfo. | ||
// | ||
// A parse failure is returned wrapped by errors.Trace. The parsed statement is type-asserted to | ||
// *ast.CreateTableStmt without a check, so any other statement kind panics. | ||
// A new mock context is created for the call rather than reusing s.ctx. | ||
func (s *testStatsSuite) mockTableInfo(sql string, tblID int64) (*model.TableInfo, error) { | ||
stmt, err := parser.New().ParseOneStmt(sql, "", "") | ||
if err != nil { | ||
return nil, errors.Trace(err) | ||
} | ||
mockTbl, err := ddl.MockTableInfo(mock.NewContext(), stmt.(*ast.CreateTableStmt), tblID) | ||
return mockTbl, err | ||
} | ||
|
||
// TestInnerJoinStats builds two DataSource children whose stats are loaded from testdata/t1.json and | ||
// testdata/t2.json, joins them on the first column of each side, and checks the row count and the | ||
// per-column histograms produced by deriveInnerJoinStatsWithHist. It then halves t2's row count, | ||
// histogram count and cardinalities in place and checks that the derived row count becomes 1. | ||
func (s *testStatsSuite) TestInnerJoinStats(c *C) { | ||
// t1's values are (i, i) where i = 1..9. Each pair appears once. | ||
t1Tbl, err := s.mockTableInfo("create table t1(a int, b int, index a(a, b))", 1) | ||
c.Assert(err, IsNil) | ||
t1Coll, err := s.loadStatsByFileName("t1.json", t1Tbl) | ||
alivxxx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
c.Assert(err, IsNil) | ||
t1ExprCols := expression.ColumnInfos2ColumnsWithDBName(s.ctx, model.NewCIStr("test"), t1Tbl.Name, t1Tbl.Columns) | ||
t1FinalColl := t1Coll.GenerateHistCollFromColumnInfo(t1Tbl.Columns, t1ExprCols) | ||
t1StatsInfo := &property.StatsInfo{ | ||
RowCount: float64(t1Coll.Count), | ||
Cardinality: make([]float64, len(t1ExprCols)), | ||
HistColl: t1FinalColl, | ||
} | ||
// Each column's cardinality is taken from the NDV of its histogram. | ||
for i := range t1ExprCols { | ||
t1StatsInfo.Cardinality[i] = float64(t1FinalColl.Columns[t1ExprCols[i].UniqueID].NDV) | ||
} | ||
t1Child := DataSource{}.Init(s.ctx) | ||
t1Child.schema = expression.NewSchema(t1ExprCols...) | ||
t1Child.stats = t1StatsInfo | ||
|
||
t2Tbl, err := s.mockTableInfo("create table t2(a int, b int, index a(a, b))", 2) | ||
winoros marked this conversation as resolved.
Show resolved
Hide resolved
|
||
c.Assert(err, IsNil) | ||
// t2's values are (i, i) where i = 8..15. Each pair appears once. | ||
t2Coll, err := s.loadStatsByFileName("t2.json", t2Tbl) | ||
c.Assert(err, IsNil) | ||
t2ExprCols := expression.ColumnInfos2ColumnsWithDBName(s.ctx, model.NewCIStr("test"), t2Tbl.Name, t2Tbl.Columns) | ||
t2FinalColl := t2Coll.GenerateHistCollFromColumnInfo(t2Tbl.Columns, t2ExprCols) | ||
t2StatsInfo := &property.StatsInfo{ | ||
RowCount: float64(t2Coll.Count), | ||
Cardinality: make([]float64, len(t2ExprCols)), | ||
HistColl: t2FinalColl, | ||
} | ||
for i := range t2ExprCols { | ||
t2StatsInfo.Cardinality[i] = float64(t2FinalColl.Columns[t2ExprCols[i].UniqueID].NDV) | ||
} | ||
t2Child := DataSource{}.Init(s.ctx) | ||
t2Child.schema = expression.NewSchema(t2ExprCols...) | ||
t2Child.stats = t2StatsInfo | ||
|
||
// Join the two children on the first column of each schema. | ||
join := LogicalJoin{}.Init(s.ctx) | ||
join.SetChildren(t1Child, t2Child) | ||
join.schema = expression.MergeSchema(t1Child.schema, t2Child.schema) | ||
finalStats, err := join.deriveInnerJoinStatsWithHist([]*expression.Column{t1Child.schema.Columns[0]}, []*expression.Column{t2Child.schema.Columns[0]}) | ||
c.Assert(err, IsNil) | ||
c.Assert(finalStats.RowCount, Equals, float64(2)) | ||
c.Assert(len(finalStats.HistColl.Columns), Equals, 4) | ||
ans1 := `column:0 ndv:2 totColSize:0 | ||
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1 | ||
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1` | ||
c.Assert(finalStats.HistColl.Columns[1].String(), Equals, ans1) | ||
ans2 := `column:2 ndv:9 totColSize:9 | ||
num: 1 lower_bound: 1 upper_bound: 1 repeats: 1 | ||
num: 1 lower_bound: 2 upper_bound: 2 repeats: 1 | ||
num: 1 lower_bound: 3 upper_bound: 3 repeats: 1 | ||
num: 1 lower_bound: 4 upper_bound: 4 repeats: 1 | ||
num: 1 lower_bound: 5 upper_bound: 5 repeats: 1 | ||
num: 1 lower_bound: 6 upper_bound: 6 repeats: 1 | ||
num: 1 lower_bound: 7 upper_bound: 7 repeats: 1 | ||
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1 | ||
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1` | ||
c.Assert(finalStats.HistColl.Columns[2].String(), Equals, ans2) | ||
c.Assert(finalStats.HistColl.Columns[3].String(), Equals, ans1) | ||
ans4 := `column:2 ndv:8 totColSize:8 | ||
num: 1 lower_bound: 8 upper_bound: 8 repeats: 1 | ||
num: 1 lower_bound: 9 upper_bound: 9 repeats: 1 | ||
num: 1 lower_bound: 10 upper_bound: 10 repeats: 1 | ||
num: 1 lower_bound: 11 upper_bound: 11 repeats: 1 | ||
num: 1 lower_bound: 12 upper_bound: 12 repeats: 1 | ||
num: 1 lower_bound: 13 upper_bound: 13 repeats: 1 | ||
num: 1 lower_bound: 14 upper_bound: 14 repeats: 1 | ||
num: 1 lower_bound: 15 upper_bound: 15 repeats: 1` | ||
c.Assert(finalStats.HistColl.Columns[4].String(), Equals, ans4) | ||
// Halve t2's stats in place (t2Child.stats points at t2StatsInfo) and derive again. | ||
t2StatsInfo.RowCount /= 2 | ||
t2StatsInfo.HistColl.Count /= 2 | ||
for i := range t2StatsInfo.Cardinality { | ||
t2StatsInfo.Cardinality[i] /= 2 | ||
} | ||
finalStats, err = join.deriveInnerJoinStatsWithHist([]*expression.Column{t1Child.schema.Columns[0]}, []*expression.Column{t2Child.schema.Columns[0]}) | ||
c.Assert(err, IsNil) | ||
c.Assert(finalStats.RowCount, Equals, float64(1)) | ||
} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shall we pass the `childStats` parameter down to `deriveInnerJoinStatsWithHist`, so that this `DeriveStats` function can be used by the cascades planner as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I'm using `childStats` in the new commits. But the method still uses the schema of the join's children internally, so it seems that I need some way to not rely on it.