Skip to content

Commit

Permalink
statistics: improve topn pruning logic (#34282)
Browse files Browse the repository at this point in the history
ref #32758, close #34256
  • Loading branch information
time-and-fate authored Apr 27, 2022
1 parent d22cd59 commit 6bd54be
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 5 deletions.
12 changes: 7 additions & 5 deletions statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,8 @@ func BuildHistAndTopN(
// pruneTopNItem tries to prune the least common values in the top-n list if it is not significantly more common than the values not in the list.
// We assume that the selectivity of each value not in the top-n list is 1/remained_ndv, which is how EqualRowCount is implemented internally.
func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
// If the sampleRows holds all rows. We just return the top-n directly.
if sampleRows == totalRows || totalRows <= 1 {
// If sampleRows holds all rows, or the NDV of the samples equals the actual NDV, we just return the TopN directly.
if sampleRows == totalRows || totalRows <= 1 || int64(len(topns)) >= ndv {
return topns
}
// Sum the occurrences except the least common one from the top-n list. To check whether the least common one is worth
Expand All @@ -396,7 +396,7 @@ func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64
if selectivity > 1 {
selectivity = 1
}
otherNDV := float64(ndv) - float64(topNNum)
otherNDV := float64(ndv) - (float64(topNNum) - 1)
if otherNDV > 1 {
selectivity /= otherNDV
}
Expand All @@ -407,11 +407,13 @@ func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64
// Thus the variance is the following formula.
variance := n * K * (N - K) * (N - n) / (N * N * (N - 1))
stddev := math.Sqrt(variance)
// We choose the bound that plus two stddev of the sample frequency plus an additional 0.5 for the continuity correction.
// We choose the bound to be the sample frequency plus two stddev, plus an additional 0.5 for the continuity correction.
// Note:
// The mean + 2 * stddev is known as Wald confidence interval, plus 0.5 would be continuity-corrected Wald interval
if float64(topns[topNNum-1].Count) > selectivity*n+2*stddev+0.5 {
// If the current one is worth storing, the latter ones too. So we just break here.
// The estimated selectivity of this item in the TopN is significantly higher than that of values not in the TopN.
// So this value, and all other values in the TopN (whose selectivity is higher than this value's), are
// worth keeping in the TopN list, and we stop pruning now.
break
}
// Current one is not worth storing, remove it and subtract it from sumCount, go to next one.
Expand Down
40 changes: 40 additions & 0 deletions statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -675,3 +675,43 @@ func SubTestHistogramProtoConversion() func(*testing.T) {
require.True(t, HistogramEqual(col, h, true))
}
}

// TestPruneTopN feeds pruneTopNItem several TopN lists and asserts that each
// list comes back equal to the input, i.e. that nothing is pruned from it.
func TestPruneTopN(t *testing.T) {
	// Input for the last case: 100 distinct single-byte values, each counted 1_000 times.
	var uniform []TopNMeta
	for b := 0; b < 100; b++ {
		uniform = append(uniform, TopNMeta{[]byte{byte(b)}, 1_000})
	}

	cases := []struct {
		topn       []TopNMeta
		ndv        int64
		nullCount  int64
		sampleRows int64
		totalRows  int64
	}{
		// case 1: two items with very different counts, and an NDV of 2.
		{
			topn:       []TopNMeta{{[]byte{1}, 100_000}, {[]byte{2}, 10}},
			ndv:        2,
			nullCount:  0,
			sampleRows: 100_010,
			totalRows:  500_050,
		},
		// case 2: four items whose counts add up to sampleRows, and an NDV of 5.
		{
			topn: []TopNMeta{
				{[]byte{1}, 30_000},
				{[]byte{2}, 30_000},
				{[]byte{3}, 20_000},
				{[]byte{4}, 20_000},
			},
			ndv:        5,
			nullCount:  0,
			sampleRows: 100_000,
			totalRows:  10_000_000,
		},
		// case 3: 100 equally frequent items, and an NDV of 100.
		{
			topn:       uniform,
			ndv:        100,
			nullCount:  0,
			sampleRows: 100_000,
			totalRows:  10_000_000,
		},
	}

	for _, tc := range cases {
		pruned := pruneTopNItem(tc.topn, tc.ndv, tc.nullCount, tc.sampleRows, tc.totalRows)
		require.Equal(t, tc.topn, pruned)
	}
}

0 comments on commit 6bd54be

Please sign in to comment.