From 159bd62f88f01681add9277733e4215a3e0cf6d9 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich <yahor@cockroachlabs.com>
Date: Fri, 16 Sep 2022 16:52:22 -0400
Subject: [PATCH] stats: fix buckets for INT2 and INT4

Previously, if we needed to create "outer" histogram buckets (which is
the case when minimum and maximum values in the column weren't sampled
yet they contributed to the distinct count) for INT2 and INT4 types, we
would use the values that exceeded the supported range for those types.
This could lead to incorrect estimation later on when those "outer"
buckets are used during the costing as well as the histograms would need
to be manually edited to be injected. This is now fixed by handling
these two types separately.

Release note: None
---
 pkg/sql/logictest/testdata/logic_test/stats |  39 +++++++
 pkg/sql/stats/histogram.go                  | 116 +++++++++++++++-----
 pkg/sql/stats/histogram_test.go             |  16 ++-
 3 files changed, 139 insertions(+), 32 deletions(-)
 create mode 100644 pkg/sql/logictest/testdata/logic_test/stats

diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats
new file mode 100644
index 000000000000..6d1fb6385b3b
--- /dev/null
+++ b/pkg/sql/logictest/testdata/logic_test/stats
@@ -0,0 +1,39 @@
+# LogicTest: !fakedist-disk
+
+# Note that we disable the "forced disk spilling" config because the histograms
+# are dropped if the stats collection reaches the memory budget limit.
+
+# Regression test for using values outside of the range supported by the column
+# type for the histogram buckets (#76887).
+statement ok
+CREATE TABLE t (c INT2);
+
+# Insert many values so that the boundary values are likely to not be sampled.
+# Splitting the INSERT statement into two such that negative values are inserted
+# later for some reason makes it more likely that "outer" histogram buckets will
+# be needed.
+statement ok
+INSERT INTO t SELECT generate_series(1, 10000);
+INSERT INTO t SELECT generate_series(-10000, 0);
+
+statement ok
+ANALYZE t;
+
+# Get the histogram ID for column 'c'.
+let $histogram_id
+WITH h(columns, id) AS
+  (SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t])
+SELECT id FROM h WHERE columns = ARRAY['c'];
+
+# Run a query that verifies that minimum and maximum values of the histogram
+# buckets are exactly the boundaries of the INT2 supported range (unless -10000
+# and 10000 values were sampled).
+query B
+SELECT CASE
+  WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2
+    THEN true -- if the sampling picked the boundary values, we're happy
+  ELSE
+    (SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id])
+  END
+----
+true
diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go
index 65b52eefb28c..da0f91492b57 100644
--- a/pkg/sql/stats/histogram.go
+++ b/pkg/sql/stats/histogram.go
@@ -159,7 +159,7 @@ func EquiDepthHistogram(
 		lowerBound = getNextLowerBound(evalCtx, upper)
 	}
 
-	h.adjustCounts(evalCtx, float64(numRows), float64(distinctCount))
+	h.adjustCounts(evalCtx, colType, float64(numRows), float64(distinctCount))
 	histogramData, err := h.toHistogramData(colType)
 	return histogramData, h.buckets, err
 }
@@ -171,7 +171,7 @@ type histogram struct {
 // adjustCounts adjusts the row count and number of distinct values per bucket
 // based on the total row count and estimated distinct count.
 func (h *histogram) adjustCounts(
-	evalCtx *tree.EvalContext, rowCountTotal, distinctCountTotal float64,
+	evalCtx *tree.EvalContext, colType *types.T, rowCountTotal, distinctCountTotal float64,
 ) {
 	// Calculate the current state of the histogram so we can adjust it as needed.
 	// The number of rows and distinct values represented by the histogram should
@@ -274,7 +274,7 @@ func (h *histogram) adjustCounts(
 	remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq
 	if remDistinctCount > 0 {
 		h.addOuterBuckets(
-			evalCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
+			evalCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
 		)
 	}
 
@@ -291,39 +291,102 @@ func (h *histogram) adjustCounts(
 	}
 }
 
+// getMinVal returns the minimum value for the minimum "outer" bucket if the
+// value exists. The boolean indicates whether it exists and the bucket needs to
+// be created.
+func getMinVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
+	if t.Family() == types.IntFamily {
+		// INT2 and INT4 require special handling.
+		// TODO(yuzefovich): other types might need it too, but it's less
+		// pressing to fix that.
+		bound, ok := upperBound.(*tree.DInt)
+		if !ok {
+			// This shouldn't happen, but we want to be defensive.
+			return nil, false
+		}
+		i := int64(*bound)
+		switch t.Width() {
+		case 16:
+			if i <= math.MinInt16 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MinInt16)), true
+		case 32:
+			if i <= math.MinInt32 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MinInt32)), true
+		}
+	}
+	if upperBound.IsMin(evalCtx) {
+		return nil, false
+	}
+	return upperBound.Min(evalCtx)
+}
+
+// getMaxVal returns the maximum value for the maximum "outer" bucket if the
+// value exists. The boolean indicates whether it exists and the bucket needs to
+// be created.
+func getMaxVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
+	if t.Family() == types.IntFamily {
+		// INT2 and INT4 require special handling.
+		// TODO(yuzefovich): other types might need it too, but it's less
+		// pressing to fix that.
+		bound, ok := upperBound.(*tree.DInt)
+		if !ok {
+			// This shouldn't happen, but we want to be defensive.
+			return nil, false
+		}
+		i := int64(*bound)
+		switch t.Width() {
+		case 16:
+			if i >= math.MaxInt16 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MaxInt16)), true
+		case 32:
+			if i >= math.MaxInt32 { // use inequality to be conservative
+				return nil, false
+			}
+			return tree.NewDInt(tree.DInt(math.MaxInt32)), true
+		}
+	}
+	if upperBound.IsMax(evalCtx) {
+		return nil, false
+	}
+	return upperBound.Max(evalCtx)
+}
+
 // addOuterBuckets adds buckets above and below the existing buckets in the
 // histogram to include the remaining distinct values in remDistinctCount. It
 // also increments the counters rowCountEq, distinctCountEq, rowCountRange, and
 // distinctCountRange as needed.
 func (h *histogram) addOuterBuckets(
 	evalCtx *tree.EvalContext,
+	colType *types.T,
 	remDistinctCount float64,
 	rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64,
 ) {
 	var maxDistinctCountExtraBuckets float64
 	var addedMin, addedMax bool
 	var newBuckets int
-	if !h.buckets[0].UpperBound.IsMin(evalCtx) {
-		if minVal, ok := h.buckets[0].UpperBound.Min(evalCtx); ok {
-			lowerBound := minVal
-			upperBound := h.buckets[0].UpperBound
-			maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
-			maxDistinctCountExtraBuckets += maxDistRange
-			h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
-			addedMin = true
-			newBuckets++
-		}
-	}
-	if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(evalCtx) {
-		if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(evalCtx); ok {
-			lowerBound := h.buckets[len(h.buckets)-1].UpperBound
-			upperBound := maxVal
-			maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
-			maxDistinctCountExtraBuckets += maxDistRange
-			h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
-			addedMax = true
-			newBuckets++
-		}
+	if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, evalCtx); ok {
+		lowerBound := minVal
+		upperBound := h.buckets[0].UpperBound
+		maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
+		maxDistinctCountExtraBuckets += maxDistRange
+		h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
+		addedMin = true
+		newBuckets++
+	}
+	if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, evalCtx); ok {
+		lowerBound := h.buckets[len(h.buckets)-1].UpperBound
+		upperBound := maxVal
+		maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
+		maxDistinctCountExtraBuckets += maxDistRange
+		h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
+		addedMax = true
+		newBuckets++
 	}
 
 	if newBuckets == 0 {
@@ -333,8 +396,7 @@ func (h *histogram) addOuterBuckets(
 
 	// If this is an enum or bool histogram, increment numEq for the upper
 	// bounds.
-	if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily ||
-		typFam == types.BoolFamily {
+	if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily {
 		if addedMin {
 			h.buckets[0].NumEq++
 		}
@@ -367,7 +429,7 @@ func (h *histogram) addOuterBuckets(
 		maxDistRange, countable := maxDistinctRange(evalCtx, lowerBound, upperBound)
 
 		inc := avgRemPerBucket
-		if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily {
+		if countable && colType.Family() == types.EnumFamily {
 			// Set the increment proportional to the remaining number of
 			// distinct values in the bucket. This only really matters for
 			// enums.
diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go
index f9b83331874a..48dda440a8e7 100644
--- a/pkg/sql/stats/histogram_test.go
+++ b/pkg/sql/stats/histogram_test.go
@@ -531,7 +531,11 @@ func TestAdjustCounts(t *testing.T) {
 		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
 			actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))}
 			copy(actual.buckets, tc.h)
-			actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount)
+			colType := types.Int
+			if len(tc.h) > 0 {
+				colType = tc.h[0].UpperBound.ResolvedType()
+			}
+			actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount)
 			roundHistogram(&actual)
 			if !reflect.DeepEqual(actual.buckets, tc.expected) {
 				t.Fatalf("expected %v but found %v", tc.expected, actual.buckets)
@@ -541,7 +545,7 @@ func TestAdjustCounts(t *testing.T) {
 
 	t.Run("random", func(t *testing.T) {
 		// randHist returns a random histogram with anywhere from 1-200 buckets.
-		randHist := func() histogram {
+		randHist := func() (histogram, *types.T) {
 			numBuckets := rand.Intn(200) + 1
 			buckets := make([]cat.HistogramBucket, numBuckets)
 			ub := rand.Intn(100000000)
@@ -549,6 +553,7 @@ func TestAdjustCounts(t *testing.T) {
 			if rand.Intn(2) == 0 {
 				ub = -ub
 			}
+			colType := types.Int
 			buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub))
 			buckets[0].NumEq = float64(rand.Intn(1000)) + 1
 			for i := 1; i < len(buckets); i++ {
@@ -561,17 +566,18 @@ func TestAdjustCounts(t *testing.T) {
 			}
 			// Half the time, use floats instead of ints.
 			if rand.Intn(2) == 0 {
+				colType = types.Float
 				for i := range buckets {
 					buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt)))
 				}
 			}
-			return histogram{buckets: buckets}
+			return histogram{buckets: buckets}, colType
 		}
 
 		// Create 100 random histograms, and check that we can correctly adjust the
 		// counts to match a random row count and distinct count.
 		for trial := 0; trial < 100; trial++ {
-			h := randHist()
+			h, colType := randHist()
 			rowCount := rand.Intn(1000000)
 			distinctCount := rand.Intn(rowCount + 1)
 
@@ -581,7 +587,7 @@ func TestAdjustCounts(t *testing.T) {
 			distinctCount = max(distinctCount, len(h.buckets))
 
 			// Adjust the counts in the histogram to match the provided counts.
-			h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount))
+			h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount))
 
 			// Check that the resulting histogram is valid.
 			if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {