Skip to content

Commit

Permalink
stats: fix buckets for INT2 and INT4
Browse files Browse the repository at this point in the history
Previously, if we needed to create "outer" histogram buckets (which is
the case when minimum and maximum values in the column weren't sampled
yet they contributed to the distinct count) for INT2 and INT4 types, we
would use the values that exceeded the supported range for those types.
This could lead to incorrect estimation later on when those "outer"
buckets are used during the costing as well as the histograms would need
to be manually edited to be injected. This is now fixed by handling
these two types separately.

Release note: None
  • Loading branch information
yuzefovich committed Sep 20, 2022
1 parent d336fb2 commit 9da6f98
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 33 deletions.
7 changes: 7 additions & 0 deletions pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/stats
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# LogicTest: !fakedist-disk

# Note that we disable the "forced disk spilling" config because the histograms
# are dropped if the stats collection reaches the memory budget limit.

# Regression test for using values outside of the range supported by the column
# type for the histogram buckets (#76887).
statement ok
CREATE TABLE t (c INT2);

# Insert many values so that the boundary values are likely to not be sampled.
# Splitting the INSERT statement into two such that negative values are inserted
# later for some reason makes it more likely that "outer" histogram buckets will
# be needed.
statement ok
INSERT INTO t SELECT generate_series(1, 10000);
INSERT INTO t SELECT generate_series(-10000, 0);

statement ok
ANALYZE t;

# Get the histogram ID for column 'c'.
let $histogram_id
WITH h(columns, id) AS
(SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t])
SELECT id FROM h WHERE columns = ARRAY['c'];

# Run a query that verifies that minimum and maximum values of the histogram
# buckets are exactly the boundaries of the INT2 supported range (unless -10000
# and 10000 values were sampled).
query B
SELECT CASE
WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2
THEN true -- if the sampling picked the boundary values, we're happy
ELSE
(SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id])
END
----
true
7 changes: 7 additions & 0 deletions pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions pkg/sql/logictest/tests/fakedist/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions pkg/sql/logictest/tests/local-vec-off/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions pkg/sql/logictest/tests/local/generated_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/sql/stats/forecast.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ func forecastColumnStatistics(
// Now adjust for consistency. We don't use any session data for operations
// on upper bounds, so a nil *eval.Context works as our tree.CompareContext.
var compareCtx *eval.Context
hist.adjustCounts(compareCtx, nonNullRowCount, nonNullDistinctCount)
hist.adjustCounts(compareCtx, observed[0].HistogramData.ColumnType, nonNullRowCount, nonNullDistinctCount)

// Finally, convert back to HistogramData.
histData, err := hist.toHistogramData(observed[0].HistogramData.ColumnType)
Expand Down
120 changes: 93 additions & 27 deletions pkg/sql/stats/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ func EquiDepthHistogram(
lowerBound = getNextLowerBound(compareCtx, upper)
}

h.adjustCounts(compareCtx, float64(numRows), float64(distinctCount))
h.adjustCounts(compareCtx, colType, float64(numRows), float64(distinctCount))
histogramData, err := h.toHistogramData(colType)
return histogramData, h.buckets, err
}
Expand All @@ -185,7 +185,7 @@ type histogram struct {
// count and estimated distinct count should not include NULL values, and the
// histogram should not contain any buckets for NULL values.
func (h *histogram) adjustCounts(
compareCtx tree.CompareContext, rowCountTotal, distinctCountTotal float64,
compareCtx tree.CompareContext, colType *types.T, rowCountTotal, distinctCountTotal float64,
) {
// Empty table cases.
if rowCountTotal <= 0 || distinctCountTotal <= 0 {
Expand Down Expand Up @@ -298,7 +298,7 @@ func (h *histogram) adjustCounts(
remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq
if remDistinctCount > 0 {
h.addOuterBuckets(
compareCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
compareCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
)
}

Expand Down Expand Up @@ -344,39 +344,106 @@ func (h *histogram) removeZeroBuckets() {
h.buckets = h.buckets[:j]
}

// getMinVal returns the minimum value for the minimum "outer" bucket if the
// value exists. The boolean indicates whether it exists and the bucket needs to
// be created.
func getMinVal(
upperBound tree.Datum, t *types.T, compareCtx tree.CompareContext,
) (tree.Datum, bool) {
if t.Family() == types.IntFamily {
// INT2 and INT4 require special handling.
// TODO(yuzefovich): other types might need it too, but it's less
// pressing to fix that.
bound, ok := upperBound.(*tree.DInt)
if !ok {
// This shouldn't happen, but we want to be defensive.
return nil, false
}
i := int64(*bound)
switch t.Width() {
case 16:
if i <= math.MinInt16 { // use inequality to be conservative
return nil, false
}
return tree.NewDInt(tree.DInt(math.MinInt16)), true
case 32:
if i <= math.MinInt32 { // use inequality to be conservative
return nil, false
}
return tree.NewDInt(tree.DInt(math.MinInt32)), true
}
}
if upperBound.IsMin(compareCtx) {
return nil, false
}
return upperBound.Min(compareCtx)
}

// getMaxVal returns the maximum value for the maximum "outer" bucket if the
// value exists. The boolean indicates whether it exists and the bucket needs to
// be created.
func getMaxVal(
upperBound tree.Datum, t *types.T, compareCtx tree.CompareContext,
) (tree.Datum, bool) {
if t.Family() == types.IntFamily {
// INT2 and INT4 require special handling.
// TODO(yuzefovich): other types might need it too, but it's less
// pressing to fix that.
bound, ok := upperBound.(*tree.DInt)
if !ok {
// This shouldn't happen, but we want to be defensive.
return nil, false
}
i := int64(*bound)
switch t.Width() {
case 16:
if i >= math.MaxInt16 { // use inequality to be conservative
return nil, false
}
return tree.NewDInt(tree.DInt(math.MaxInt16)), true
case 32:
if i >= math.MaxInt32 { // use inequality to be conservative
return nil, false
}
return tree.NewDInt(tree.DInt(math.MaxInt32)), true
}
}
if upperBound.IsMax(compareCtx) {
return nil, false
}
return upperBound.Max(compareCtx)
}

// addOuterBuckets adds buckets above and below the existing buckets in the
// histogram to include the remaining distinct values in remDistinctCount. It
// also increments the counters rowCountEq, distinctCountEq, rowCountRange, and
// distinctCountRange as needed.
func (h *histogram) addOuterBuckets(
compareCtx tree.CompareContext,
colType *types.T,
remDistinctCount float64,
rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64,
) {
var maxDistinctCountExtraBuckets float64
var addedMin, addedMax bool
var newBuckets int
if !h.buckets[0].UpperBound.IsMin(compareCtx) {
if minVal, ok := h.buckets[0].UpperBound.Min(compareCtx); ok {
lowerBound := minVal
upperBound := h.buckets[0].UpperBound
maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
addedMin = true
newBuckets++
}
}
if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(compareCtx) {
if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(compareCtx); ok {
lowerBound := h.buckets[len(h.buckets)-1].UpperBound
upperBound := maxVal
maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
addedMax = true
newBuckets++
}
if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, compareCtx); ok {
lowerBound := minVal
upperBound := h.buckets[0].UpperBound
maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
addedMin = true
newBuckets++
}
if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, compareCtx); ok {
lowerBound := h.buckets[len(h.buckets)-1].UpperBound
upperBound := maxVal
maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
addedMax = true
newBuckets++
}

if newBuckets == 0 {
Expand All @@ -386,8 +453,7 @@ func (h *histogram) addOuterBuckets(

// If this is an enum or bool histogram, increment numEq for the upper
// bounds.
if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily ||
typFam == types.BoolFamily {
if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily {
if addedMin {
h.buckets[0].NumEq++
}
Expand Down Expand Up @@ -420,7 +486,7 @@ func (h *histogram) addOuterBuckets(
maxDistRange, countable := maxDistinctRange(compareCtx, lowerBound, upperBound)

inc := avgRemPerBucket
if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily {
if countable && colType.Family() == types.EnumFamily {
// Set the increment proportional to the remaining number of
// distinct values in the bucket. This only really matters for
// enums.
Expand Down
16 changes: 11 additions & 5 deletions pkg/sql/stats/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,11 @@ func TestAdjustCounts(t *testing.T) {
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))}
copy(actual.buckets, tc.h)
actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount)
colType := types.Int
if len(tc.h) > 0 {
colType = tc.h[0].UpperBound.ResolvedType()
}
actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount)
roundHistogram(&actual)
if !reflect.DeepEqual(actual.buckets, tc.expected) {
t.Fatalf("expected %v but found %v", tc.expected, actual.buckets)
Expand All @@ -701,14 +705,15 @@ func TestAdjustCounts(t *testing.T) {

t.Run("random", func(t *testing.T) {
// randHist returns a random histogram with anywhere from 1-200 buckets.
randHist := func() histogram {
randHist := func() (histogram, *types.T) {
numBuckets := rand.Intn(200) + 1
buckets := make([]cat.HistogramBucket, numBuckets)
ub := rand.Intn(100000000)
// Half the time, make it negative.
if rand.Intn(2) == 0 {
ub = -ub
}
colType := types.Int
buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub))
buckets[0].NumEq = float64(rand.Intn(1000)) + 1
for i := 1; i < len(buckets); i++ {
Expand All @@ -721,17 +726,18 @@ func TestAdjustCounts(t *testing.T) {
}
// Half the time, use floats instead of ints.
if rand.Intn(2) == 0 {
colType = types.Float
for i := range buckets {
buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt)))
}
}
return histogram{buckets: buckets}
return histogram{buckets: buckets}, colType
}

// Create 100 random histograms, and check that we can correctly adjust the
// counts to match a random row count and distinct count.
for trial := 0; trial < 100; trial++ {
h := randHist()
h, colType := randHist()
rowCount := rand.Intn(1000000)
distinctCount := rand.Intn(rowCount + 1)

Expand All @@ -741,7 +747,7 @@ func TestAdjustCounts(t *testing.T) {
distinctCount = max(distinctCount, len(h.buckets))

// Adjust the counts in the histogram to match the provided counts.
h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount))
h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount))

// Check that the resulting histogram is valid.
if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {
Expand Down

0 comments on commit 9da6f98

Please sign in to comment.