Skip to content

Commit

Permalink
sql/stats: support rowCountEq = 0 in histogram.adjustCounts
Browse files: browse the repository at this point in the history
The predicted histograms in statistics forecasts will often have buckets
with NumEq = 0, and some predicted histograms will have _all_ buckets
with NumEq = 0. This wasn't possible before forecasting, because the
histograms produced by `EquiDepthHistogram` never have any buckets with
NumEq = 0.

If `adjustCounts` is called on such a histogram, `rowCountEq` and
`distinctCountEq` will be zero. `adjustCounts` should still be able to
fix such a histogram to have sum(NumRange) = rowCountTotal and
sum(DistinctRange) = distinctCountTotal. This patch teaches
`adjustCounts` to handle these histograms.

(Similarly, predicted histograms could have all buckets with
NumRange = 0, but this is already possible for histograms produced by
`EquiDepthHistogram`, so `adjustCounts` already handles these.)

Also, add a few more comments to `adjustCounts`.

Assists: cockroachdb#79872

Release note: None
  • Loading branch information
michae2 committed Jun 6, 2022
1 parent 2fc45fc commit 5e5edf5
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 8 deletions.
30 changes: 22 additions & 8 deletions pkg/sql/stats/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,16 @@ type histogram struct {
}

// adjustCounts adjusts the row count and number of distinct values per bucket
// based on the total row count and estimated distinct count.
// to equal the total row count and estimated distinct count. The total row
// count and estimated distinct count should not include NULL values, and the
// histogram should not contain any buckets for NULL values.
func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctCountTotal float64) {
// Empty table cases.
if rowCountTotal <= 0 || distinctCountTotal <= 0 {
h.buckets = make([]cat.HistogramBucket, 0)
return
}

// Calculate the current state of the histogram so we can adjust it as needed.
// The number of rows and distinct values represented by the histogram should
// be adjusted so they equal rowCountTotal and distinctCountTotal.
Expand All @@ -189,13 +197,16 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
}
}

if rowCountEq <= 0 {
panic(errors.AssertionFailedf("expected a positive value for rowCountEq"))
// If the histogram only had empty buckets, we can't adjust it.
if rowCountRange+rowCountEq <= 0 || distinctCountRange+distinctCountEq <= 0 {
h.buckets = make([]cat.HistogramBucket, 0)
return
}

// If the upper bounds account for all distinct values (as estimated by the
// sketch), make the histogram consistent by clearing the ranges and adjusting
// the NumEq values to add up to the row count.
// the NumEq values to add up to the row count. This might be the case for
// low-cardinality types like BOOL and ENUM or other low-cardinality data.
if distinctCountEq >= distinctCountTotal {
adjustmentFactorNumEq := rowCountTotal / rowCountEq
for i := range h.buckets {
Expand All @@ -209,7 +220,7 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
// The upper bounds do not account for all distinct values, so adjust the
// NumEq values if needed so they add up to less than the row count.
remDistinctCount := distinctCountTotal - distinctCountEq
if rowCountEq+remDistinctCount >= rowCountTotal {
if rowCountEq > 0 && rowCountEq+remDistinctCount > rowCountTotal {
targetRowCountEq := rowCountTotal - remDistinctCount
adjustmentFactorNumEq := targetRowCountEq / rowCountEq
for i := range h.buckets {
Expand All @@ -229,10 +240,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
lowerBound := h.buckets[0].UpperBound
upperBound := h.buckets[len(h.buckets)-1].UpperBound
if maxDistinct, ok := tree.MaxDistinctCount(evalCtx, lowerBound, upperBound); ok {
// Subtract distinctCountEq to account for the upper bounds of the
// Subtract number of buckets to account for the upper bounds of the
// buckets, along with the current range distinct count which has already
// been accounted for.
maxDistinctCountRange = float64(maxDistinct) - distinctCountEq - distinctCountRange
maxDistinctCountRange = float64(maxDistinct) - float64(len(h.buckets)) - distinctCountRange
}

// Add distinct values into the histogram if there is space. Increment the
Expand Down Expand Up @@ -277,7 +288,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
)
}

// Adjust the values so the row counts and distinct counts add up correctly.
// At this point rowCountRange + rowCountEq >= distinctCountTotal but not
// necessarily rowCountTotal, so we've accounted for all distinct values, and
// any additional rows we add will be duplicate values. We can spread the
// final adjustment proportionately across both NumRange and NumEq.
adjustmentFactorDistinctRange := float64(1)
if distinctCountRange > 0 {
adjustmentFactorDistinctRange = (distinctCountTotal - distinctCountEq) / distinctCountRange
Expand Down
36 changes: 36 additions & 0 deletions pkg/sql/stats/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,42 @@ func TestAdjustCounts(t *testing.T) {
{NumRange: 1551.19, NumEq: 3447.09, DistinctRange: 450, UpperBound: f(1000)},
},
},
{ // Zero rowCount and distinctCount.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)},
},
rowCount: 0,
distinctCount: 0,
expected: []cat.HistogramBucket{},
},
{ // Empty initial histogram.
h: []cat.HistogramBucket{},
rowCount: 1000,
distinctCount: 1000,
expected: []cat.HistogramBucket{},
},
{ // Empty bucket in initial histogram.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
},
rowCount: 99,
distinctCount: 99,
expected: []cat.HistogramBucket{},
},
{ // All zero NumEq.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
{NumRange: 10, NumEq: 0, DistinctRange: 5, UpperBound: f(100)},
{NumRange: 10, NumEq: 0, DistinctRange: 10, UpperBound: f(200)},
},
rowCount: 100,
distinctCount: 60,
expected: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
{NumRange: 50, NumEq: 0, DistinctRange: 27.5, UpperBound: f(100)},
{NumRange: 50, NumEq: 0, DistinctRange: 32.5, UpperBound: f(200)},
},
},
}

evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())
Expand Down

0 comments on commit 5e5edf5

Please sign in to comment.