stats: fix buckets for INT2 and INT4

Previously, if we needed to create "outer" histogram buckets (which is the case when minimum and maximum values in the column weren't sampled yet they contributed to the distinct count) for INT2 and INT4 types, we would use the values that exceeded the supported range for those types. This could lead to incorrect estimation later on when those "outer" buckets are used during the costing as well as the histograms would need to be manually edited to be injected. This is now fixed by handling these two types separately. Release note: None
cockroachdb · Sep 20, 2022 · 9da6f98 · 9da6f98
1 parent d336fb2
commit 9da6f98
Show file tree

Hide file tree

Showing 10 changed files with 186 additions and 33 deletions.
diff --git a/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go b/pkg/ccl/logictestccl/tests/3node-tenant/generated_test.go
diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats
@@ -0,0 +1,39 @@
+# LogicTest: !fakedist-disk
+
+# Note that we disable the "forced disk spilling" config because the histograms
+# are dropped if the stats collection reaches the memory budget limit.
+
+# Regression test for using values outside of the range supported by the column
+# type for the histogram buckets (#76887).
+statement ok
+CREATE TABLE t (c INT2);
+
+# Insert many values so that the boundary values are likely to not be sampled.
+# Splitting the INSERT statement into two such that negative values are inserted
+# later for some reason makes it more likely that "outer" histogram buckets will
+# be needed.
+statement ok
+INSERT INTO t SELECT generate_series(1, 10000);
+INSERT INTO t SELECT generate_series(-10000, 0);
+
+statement ok
+ANALYZE t;
+
+# Get the histogram ID for column 'c'.
+let $histogram_id
+WITH h(columns, id) AS
+  (SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t])
+SELECT id FROM h WHERE columns = ARRAY['c'];
+
+# Run a query that verifies that minimum and maximum values of the histogram
+# buckets are exactly the boundaries of the INT2 supported range (unless -10000
+# and 10000 values were sampled).
+query B
+SELECT CASE
+  WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2
+    THEN true -- if the sampling picked the boundary values, we're happy
+  ELSE
+    (SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id])
+  END
+----
+true
diff --git a/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go b/pkg/sql/logictest/tests/fakedist-vec-off/generated_test.go
diff --git a/pkg/sql/logictest/tests/fakedist/generated_test.go b/pkg/sql/logictest/tests/fakedist/generated_test.go
diff --git a/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go b/pkg/sql/logictest/tests/local-legacy-schema-changer/generated_test.go
diff --git a/pkg/sql/logictest/tests/local-vec-off/generated_test.go b/pkg/sql/logictest/tests/local-vec-off/generated_test.go
diff --git a/pkg/sql/logictest/tests/local/generated_test.go b/pkg/sql/logictest/tests/local/generated_test.go
diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go
@@ -276,7 +276,7 @@ func forecastColumnStatistics(
 		// Now adjust for consistency. We don't use any session data for operations
 		// on upper bounds, so a nil *eval.Context works as our tree.CompareContext.
 		var compareCtx *eval.Context
-		hist.adjustCounts(compareCtx, nonNullRowCount, nonNullDistinctCount)
+		hist.adjustCounts(compareCtx, observed[0].HistogramData.ColumnType, nonNullRowCount, nonNullDistinctCount)
 
 		// Finally, convert back to HistogramData.
 		histData, err := hist.toHistogramData(observed[0].HistogramData.ColumnType)

diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go
@@ -168,7 +168,7 @@ func EquiDepthHistogram(
 		lowerBound = getNextLowerBound(compareCtx, upper)
 	}
 
-	h.adjustCounts(compareCtx, float64(numRows), float64(distinctCount))
+	h.adjustCounts(compareCtx, colType, float64(numRows), float64(distinctCount))
 	histogramData, err := h.toHistogramData(colType)
 	return histogramData, h.buckets, err
 }
@@ -185,7 +185,7 @@ type histogram struct {
 // count and estimated distinct count should not include NULL values, and the
 // histogram should not contain any buckets for NULL values.
 func (h *histogram) adjustCounts(
-	compareCtx tree.CompareContext, rowCountTotal, distinctCountTotal float64,
+	compareCtx tree.CompareContext, colType *types.T, rowCountTotal, distinctCountTotal float64,
 ) {
 	// Empty table cases.
 	if rowCountTotal <= 0 || distinctCountTotal <= 0 {
@@ -298,7 +298,7 @@ func (h *histogram) adjustCounts(
 	remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq
 	if remDistinctCount > 0 {
 		h.addOuterBuckets(
-			compareCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
+			compareCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
 		)
 	}
 
@@ -344,39 +344,106 @@ func (h *histogram) removeZeroBuckets() {
 	h.buckets = h.buckets[:j]
 }
 
+// getMinVal returns the minimum value for the minimum "outer" bucket if the
+// value exists. The boolean indicates whether it exists and the bucket needs to
+// be created.
+func getMinVal(
+	upperBound tree.Datum, t *types.T, compareCtx tree.CompareContext,
+) (tree.Datum, bool) {
+	if t.Family() == types.IntFamily {
+		// INT2 and INT4 require special handling.
+		// TODO(yuzefovich): other types might need it too, but it's less
+		// pressing to fix that.
+		bound, ok := upperBound.(*tree.DInt)
+		if !ok {
+			// This shouldn't happen, but we want to be defensive.
+			return nil, false
+		}
+		i := int64(*bound)
+		switch t.Width() {
+		case 16:
+			if i <= math.MinInt16 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MinInt16)), true
+		case 32:
+			if i <= math.MinInt32 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MinInt32)), true
+		}
+	}
+	if upperBound.IsMin(compareCtx) {
+		return nil, false
+	}
+	return upperBound.Min(compareCtx)
+}
+
+// getMaxVal returns the maximum value for the maximum "outer" bucket if the
+// value exists. The boolean indicates whether it exists and the bucket needs to
+// be created.
+func getMaxVal(
+	upperBound tree.Datum, t *types.T, compareCtx tree.CompareContext,
+) (tree.Datum, bool) {
+	if t.Family() == types.IntFamily {
+		// INT2 and INT4 require special handling.
+		// TODO(yuzefovich): other types might need it too, but it's less
+		// pressing to fix that.
+		bound, ok := upperBound.(*tree.DInt)
+		if !ok {
+			// This shouldn't happen, but we want to be defensive.
+			return nil, false
+		}
+		i := int64(*bound)
+		switch t.Width() {
+		case 16:
+			if i >= math.MaxInt16 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MaxInt16)), true
+		case 32:
+			if i >= math.MaxInt32 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MaxInt32)), true
+		}
+	}
+	if upperBound.IsMax(compareCtx) {
+		return nil, false
+	}
+	return upperBound.Max(compareCtx)
+}
+
 // addOuterBuckets adds buckets above and below the existing buckets in the
 // histogram to include the remaining distinct values in remDistinctCount. It
 // also increments the counters rowCountEq, distinctCountEq, rowCountRange, and
 // distinctCountRange as needed.
 func (h *histogram) addOuterBuckets(
 	compareCtx tree.CompareContext,
+	colType *types.T,
 	remDistinctCount float64,
 	rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64,
 ) {
 	var maxDistinctCountExtraBuckets float64
 	var addedMin, addedMax bool
 	var newBuckets int
-	if !h.buckets[0].UpperBound.IsMin(compareCtx) {
-		if minVal, ok := h.buckets[0].UpperBound.Min(compareCtx); ok {
-			lowerBound := minVal
-			upperBound := h.buckets[0].UpperBound
-			maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
-			maxDistinctCountExtraBuckets += maxDistRange
-			h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
-			addedMin = true
-			newBuckets++
-		}
-	}
-	if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(compareCtx) {
-		if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(compareCtx); ok {
-			lowerBound := h.buckets[len(h.buckets)-1].UpperBound
-			upperBound := maxVal
-			maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
-			maxDistinctCountExtraBuckets += maxDistRange
-			h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
-			addedMax = true
-			newBuckets++
-		}
+	if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, compareCtx); ok {
+		lowerBound := minVal
+		upperBound := h.buckets[0].UpperBound
+		maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
+		maxDistinctCountExtraBuckets += maxDistRange
+		h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
+		addedMin = true
+		newBuckets++
+	}
+	if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, compareCtx); ok {
+		lowerBound := h.buckets[len(h.buckets)-1].UpperBound
+		upperBound := maxVal
+		maxDistRange, _ := maxDistinctRange(compareCtx, lowerBound, upperBound)
+		maxDistinctCountExtraBuckets += maxDistRange
+		h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
+		addedMax = true
+		newBuckets++
 	}
 
 	if newBuckets == 0 {
@@ -386,8 +453,7 @@ func (h *histogram) addOuterBuckets(
 
 	// If this is an enum or bool histogram, increment numEq for the upper
 	// bounds.
-	if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily ||
-		typFam == types.BoolFamily {
+	if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily {
 		if addedMin {
 			h.buckets[0].NumEq++
 		}
@@ -420,7 +486,7 @@ func (h *histogram) addOuterBuckets(
 		maxDistRange, countable := maxDistinctRange(compareCtx, lowerBound, upperBound)
 
 		inc := avgRemPerBucket
-		if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily {
+		if countable && colType.Family() == types.EnumFamily {
 			// Set the increment proportional to the remaining number of
 			// distinct values in the bucket. This only really matters for
 			// enums.

diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go
@@ -691,7 +691,11 @@ func TestAdjustCounts(t *testing.T) {
 		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
 			actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))}
 			copy(actual.buckets, tc.h)
-			actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount)
+			colType := types.Int
+			if len(tc.h) > 0 {
+				colType = tc.h[0].UpperBound.ResolvedType()
+			}
+			actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount)
 			roundHistogram(&actual)
 			if !reflect.DeepEqual(actual.buckets, tc.expected) {
 				t.Fatalf("expected %v but found %v", tc.expected, actual.buckets)
@@ -701,14 +705,15 @@ func TestAdjustCounts(t *testing.T) {
 
 	t.Run("random", func(t *testing.T) {
 		// randHist returns a random histogram with anywhere from 1-200 buckets.
-		randHist := func() histogram {
+		randHist := func() (histogram, *types.T) {
 			numBuckets := rand.Intn(200) + 1
 			buckets := make([]cat.HistogramBucket, numBuckets)
 			ub := rand.Intn(100000000)
 			// Half the time, make it negative.
 			if rand.Intn(2) == 0 {
 				ub = -ub
 			}
+			colType := types.Int
 			buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub))
 			buckets[0].NumEq = float64(rand.Intn(1000)) + 1
 			for i := 1; i < len(buckets); i++ {
@@ -721,17 +726,18 @@ func TestAdjustCounts(t *testing.T) {
 			}
 			// Half the time, use floats instead of ints.
 			if rand.Intn(2) == 0 {
+				colType = types.Float
 				for i := range buckets {
 					buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt)))
 				}
 			}
-			return histogram{buckets: buckets}
+			return histogram{buckets: buckets}, colType
 		}
 
 		// Create 100 random histograms, and check that we can correctly adjust the
 		// counts to match a random row count and distinct count.
 		for trial := 0; trial < 100; trial++ {
-			h := randHist()
+			h, colType := randHist()
 			rowCount := rand.Intn(1000000)
 			distinctCount := rand.Intn(rowCount + 1)
 
@@ -741,7 +747,7 @@ func TestAdjustCounts(t *testing.T) {
 			distinctCount = max(distinctCount, len(h.buckets))
 
 			// Adjust the counts in the histogram to match the provided counts.
-			h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount))
+			h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount))
 
 			// Check that the resulting histogram is valid.
 			if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {