From 9b0eb88b200586a69fd2db6bb65ea1b8f8e0ed45 Mon Sep 17 00:00:00 2001
From: Yiding Cui <winoros@gmail.com>
Date: Sat, 29 Dec 2018 19:48:15 +0800
Subject: [PATCH] util/ranger: fix incorrect behavior about index who has
 prefix column(#8851) (#8878)

---
 util/ranger/ranger.go      | 48 ++++++++++++++++++++++++++++++++------
 util/ranger/ranger_test.go | 28 ++++++++++++++++++++++
 2 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/util/ranger/ranger.go b/util/ranger/ranger.go
index c7e8dce2a781f..8e514b6716071 100644
--- a/util/ranger/ranger.go
+++ b/util/ranger/ranger.go
@@ -330,7 +330,12 @@ func buildCNFIndexRange(sc *stmtctx.StatementContext, cols []*expression.Column,
 
 	// Take prefix index into consideration.
 	if hasPrefix(lengths) {
-		fixPrefixColRange(ranges, lengths, newTp)
+		if fixPrefixColRange(ranges, lengths, newTp) {
+			ranges, err = unionRanges(sc, ranges)
+			if err != nil {
+				return nil, errors.Trace(err)
+			}
+		}
 	}
 
 	return ranges, nil
@@ -397,20 +402,46 @@ func hasPrefix(lengths []int) bool {
 	return false
 }
 
-func fixPrefixColRange(ranges []*Range, lengths []int, tp []*types.FieldType) {
+// fixPrefixColRange checks whether the range of one column exceeds the length and needs to be cut.
+// It specially handles the last column of each range point. If the last one needs to be cut, it will
+// change the exclude status of that point and return `true` to tell the caller
+// that a range merge is needed, since the resulting intervals may intersect.
+// e.g. if the interval is (-inf -inf, a xxxxx), (a xxxxx, +inf +inf) and the length of the last column is 3,
+//      then we'll change it to (-inf -inf, a xxx], [a xxx, +inf +inf). You can see that these two intervals intersect,
+//      so we need a merge operation.
+// Q: Is checking only the last column enough to decide whether the endpoint's exclude status needs to be reset?
+// A: Yes, suppose that the interval is (-inf -inf, a xxxxx b) and only the second column needs to be cut.
+//    The result would be (-inf -inf, a xxx b) if the length of it is 3. Obviously we only need to care about the data
+//    whose first two keys are `a` and `xxx`. It reads all data whose index value begins with `a` and `xxx` and whose third
+//    value is less than `b`, covering the values that begin with `a` and `xxxxx` and whose third value is less than `b` perfectly.
+//    So in this case we don't need to reset its exclude status. The right endpoint case can be proved in the same way.
+func fixPrefixColRange(ranges []*Range, lengths []int, tp []*types.FieldType) bool {
+	hasCut := false
 	for _, ran := range ranges {
-		for i := 0; i < len(ran.LowVal); i++ {
+		lowTail := len(ran.LowVal) - 1
+		for i := 0; i < lowTail; i++ {
 			fixRangeDatum(&ran.LowVal[i], lengths[i], tp[i])
 		}
-		ran.LowExclude = false
-		for i := 0; i < len(ran.HighVal); i++ {
+		// Only a cut on the last low column widens the bound, so only then must it become inclusive.
+		lowCut := fixRangeDatum(&ran.LowVal[lowTail], lengths[lowTail], tp[lowTail])
+		if lowCut {
+			ran.LowExclude = false
+		}
+		highTail := len(ran.HighVal) - 1
+		for i := 0; i < highTail; i++ {
 			fixRangeDatum(&ran.HighVal[i], lengths[i], tp[i])
 		}
-		ran.HighExclude = false
+		// Same rule for the high endpoint: a truncated last column forces an inclusive bound.
+		highCut := fixRangeDatum(&ran.HighVal[highTail], lengths[highTail], tp[highTail])
+		if highCut {
+			ran.HighExclude = false
+		}
+		hasCut = hasCut || lowCut || highCut // sticky: a cut in any range means the caller must merge
 	}
+	return hasCut
 }
 
-func fixRangeDatum(v *types.Datum, length int, tp *types.FieldType) {
+func fixRangeDatum(v *types.Datum, length int, tp *types.FieldType) bool {
 	// If this column is prefix and the prefix length is smaller than the range, cut it.
 	// In case of UTF8, prefix should be cut by characters rather than bytes
 	if v.Kind() == types.KindString || v.Kind() == types.KindBytes {
@@ -423,12 +454,15 @@ func fixRangeDatum(v *types.Datum, length int, tp *types.FieldType) {
 				truncateStr := string(rs[:length])
 				// truncate value and limit its length
 				v.SetString(truncateStr)
+				return true
 			}
 		} else if length != types.UnspecifiedLength && len(colValue) > length {
 			// truncate value and limit its length
 			v.SetBytes(colValue[:length])
+			return true
 		}
 	}
+	return false
 }
 
 // We cannot use the FieldType of column directly. e.g. the column a is int32 and we have a > 1111111111111111111.
diff --git a/util/ranger/ranger_test.go b/util/ranger/ranger_test.go
index 8d8ae3be3a825..a931de41ecec8 100644
--- a/util/ranger/ranger_test.go
+++ b/util/ranger/ranger_test.go
@@ -544,6 +544,34 @@ func (s *testRangerSuite) TestIndexRange(c *C) {
 			filterConds: "[eq(test.t.e, 你好啊)]",
 			resultStr:   "[[\"[228 189]\",\"[228 189]\"]]",
 		},
+		{
+			indexPos:    2,
+			exprStr:     `d in ("你好啊")`,
+			accessConds: "[in(test.t.d, 你好啊)]",
+			filterConds: "[in(test.t.d, 你好啊)]",
+			resultStr:   "[[\"你好\",\"你好\"]]",
+		},
+		{
+			indexPos:    2,
+			exprStr:     `d not in ("你好啊")`,
+			accessConds: "[not(in(test.t.d, 你好啊))]",
+			filterConds: "[not(in(test.t.d, 你好啊))]",
+			resultStr:   "[(NULL,+inf]]",
+		},
+		{
+			indexPos:    2,
+			exprStr:     `d < "你好" || d > "你好"`,
+			accessConds: "[or(lt(test.t.d, 你好), gt(test.t.d, 你好))]",
+			filterConds: "[or(lt(test.t.d, 你好), gt(test.t.d, 你好))]",
+			resultStr:   "[[-inf,\"你好\") (\"你好\",+inf]]",
+		},
+		{
+			indexPos:    2,
+			exprStr:     `not(d < "你好" || d > "你好")`,
+			accessConds: "[and(ge(test.t.d, 你好), le(test.t.d, 你好))]",
+			filterConds: "[and(ge(test.t.d, 你好), le(test.t.d, 你好))]",
+			resultStr:   "[[\"你好\",\"你好\"]]",
+		},
 	}
 
 	for _, tt := range tests {