collations: MySQL-compatible text collations

Co-authored-by: Lakshya Singh <[email protected]> Signed-off-by: Vicent Marti <[email protected]> Signed-off-by: Lakshya Singh <[email protected]>
vitessio · Oct 15, 2021 · 7c75c96 · 7c75c96
1 parent ffe3574
commit 7c75c96
Show file tree

Hide file tree

Showing 58 changed files with 123,699 additions and 0 deletions.
diff --git a/go/mysql/collations/8bit.go b/go/mysql/collations/8bit.go
@@ -0,0 +1,148 @@
+/*
+Copyright 2021 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package collations
+
+// init hands a freshly allocated Collation_binary to register so that the
+// "binary" collation is available as soon as the package is loaded.
+func init() {
+	register(new(Collation_binary))
+}
+
+// simpletables groups the per-byte lookup tables embedded in the 8-bit
+// collations of this file. Only sort is read by the code in this file; the
+// purpose of the other tables is inferred from their names.
+// NOTE(review): confirm the meaning of tounicode/tolower/toupper/ctype
+// against the code that populates them.
+type simpletables struct {
+	// presumably maps a byte to its Unicode code point — TODO confirm
+	tounicode []uint16
+	// presumably maps a byte to its lowercase form — TODO confirm
+	tolower   []byte
+	// presumably maps a byte to its uppercase form — TODO confirm
+	toupper   []byte
+	// presumably holds character-class flags per byte — TODO confirm
+	ctype     []byte
+	// sort is indexed by an input byte and yields that byte's sort weight.
+	sort      []byte
+}
+
+// Collation_8bit_bin is a collation for an 8-bit character set whose
+// comparison is delegated to collationBinary. It stores its own id and name
+// and embeds the charset's simpletables.
+type Collation_8bit_bin struct {
+	id   uint
+	name string
+	simpletables
+}
+
+// init is a no-op: this collation has nothing to prepare.
+func (c *Collation_8bit_bin) init() {}
+
+// Name returns the name stored in the collation.
+func (c *Collation_8bit_bin) Name() string {
+	return c.name
+}
+// Id returns the numeric identifier stored in the collation.
+func (c *Collation_8bit_bin) Id() uint {
+	return c.id
+}
+
+// Collate compares left and right by forwarding all three arguments,
+// unchanged, to collationBinary and returning its result.
+func (c *Collation_8bit_bin) Collate(left, right []byte, rightIsPrefix bool) int {
+	return collationBinary(left, right, rightIsPrefix)
+}
+
+// WeightString appends the weight string of src to dst and returns the
+// extended slice. For this collation each source byte is its own weight.
+//
+// At most min(len(src), cap(dst), numCodepoints) bytes of src are copied.
+// The result is then padded with spaces by weightStringPadingSimple: up to
+// the capacity of the slice when padToMax is set, otherwise by at most the
+// number of requested codepoints that src did not supply.
+func (c *Collation_8bit_bin) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
+	limit := minInt(cap(dst), numCodepoints)
+	taken := minInt(len(src), limit)
+	prefix := src[:taken]
+
+	dst = append(dst, prefix...)
+	missing := numCodepoints - taken
+	return weightStringPadingSimple(' ', dst, missing, padToMax)
+}
+
+// WeightStringLen returns numBytes unchanged: WeightString emits one weight
+// byte per source byte for this collation.
+func (c *Collation_8bit_bin) WeightStringLen(numBytes int) int {
+	return numBytes
+}
+
+// Collation_8bit_simple_ci is a collation for an 8-bit character set that
+// orders bytes through the sort table of the embedded simpletables (see
+// Collate and WeightString). It stores its own id and name.
+type Collation_8bit_simple_ci struct {
+	id   uint
+	name string
+	simpletables
+}
+
+// init is a no-op: this collation has nothing to prepare.
+func (c *Collation_8bit_simple_ci) init() {}
+
+// Name returns the name stored in the collation.
+func (c *Collation_8bit_simple_ci) Name() string {
+	return c.name
+}
+
+// Id returns the numeric identifier stored in the collation.
+func (c *Collation_8bit_simple_ci) Id() uint {
+	return c.id
+}
+
+// Collate compares left and right using the per-byte weights in c.sort.
+//
+// The inputs are walked in parallel over their common length; the first
+// position whose weights differ decides the result (negative when left sorts
+// first, positive when right sorts first). If the common part is equal, the
+// length difference decides, except that with rightIsPrefix set any bytes of
+// left beyond the length of right are ignored.
+func (c *Collation_8bit_simple_ci) Collate(left, right []byte, rightIsPrefix bool) int {
+	weights := c.sort
+	shared := minInt(len(left), len(right))
+
+	for pos, lb := range left[:shared] {
+		wl, wr := weights[lb], weights[right[pos]]
+		if wl != wr {
+			return int(wl) - int(wr)
+		}
+	}
+
+	if rightIsPrefix {
+		// left is treated as if it were cut down to the shared length.
+		return shared - len(right)
+	}
+	return len(left) - len(right)
+}
+
+// WeightString appends the weight string of src to dst and returns the
+// extended slice. Every source byte is replaced by its entry in c.sort.
+//
+// At most min(len(src), cap(dst), numCodepoints) bytes of src are converted.
+// The result is then padded with spaces by weightStringPadingSimple: up to
+// the capacity of the slice when padToMax is set, otherwise by at most the
+// number of requested codepoints that src did not supply.
+func (c *Collation_8bit_simple_ci) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
+	weights := c.sort
+	taken := minInt(len(src), minInt(cap(dst), numCodepoints))
+
+	prefix := src[:taken]
+	for i := 0; i < len(prefix); i++ {
+		dst = append(dst, weights[prefix[i]])
+	}
+
+	missing := numCodepoints - taken
+	return weightStringPadingSimple(' ', dst, missing, padToMax)
+}
+
+// WeightStringLen returns numBytes unchanged: WeightString emits one weight
+// byte per source byte for this collation.
+func (c *Collation_8bit_simple_ci) WeightStringLen(numBytes int) int {
+	return numBytes
+}
+
+// weightStringPadingSimple appends copies of padChar to dst and returns the
+// result. It never grows dst beyond its current capacity.
+//
+// With padToMax set, dst is filled up to cap(dst). Otherwise at most
+// numCodepoints pad bytes are added (none when numCodepoints is zero or
+// negative), still bounded by the free capacity of dst.
+func weightStringPadingSimple(padChar byte, dst []byte, numCodepoints int, padToMax bool) []byte {
+	// Free room left in dst; appends below stay within it, so the backing
+	// array is never reallocated.
+	pad := cap(dst) - len(dst)
+	if !padToMax && numCodepoints < pad {
+		pad = numCodepoints
+	}
+	for ; pad > 0; pad-- {
+		dst = append(dst, padChar)
+	}
+	return dst
+}
+
+// Collation_binary is the stateless "binary" collation. It is the collation
+// registered by this file's init function.
+type Collation_binary struct{}
+
+// init is a no-op: this collation has nothing to prepare.
+func (c *Collation_binary) init() {}
+
+// Id returns the fixed identifier 63.
+func (c *Collation_binary) Id() uint {
+	return 63
+}
+
+// Name returns the fixed name "binary".
+func (c *Collation_binary) Name() string {
+	return "binary"
+}
+
+// Collate compares left and right by forwarding all three arguments,
+// unchanged, to collationBinary and returning its result.
+func (c *Collation_binary) Collate(left, right []byte, isPrefix bool) int {
+	return collationBinary(left, right, isPrefix)
+}
+
+// WeightString appends the weight string of src to dst and returns the
+// extended slice. For the binary collation each source byte is its own
+// weight.
+//
+// At most min(len(src), cap(dst), numCodepoints) bytes of src are copied.
+// When padToMax is set the result is filled with 0x00 bytes up to its
+// capacity; without padToMax no padding is added at all.
+func (c *Collation_binary) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
+	taken := minInt(len(src), minInt(cap(dst), numCodepoints))
+	dst = append(dst, src[:taken]...)
+	if !padToMax {
+		return dst
+	}
+
+	// Appending within the free capacity never reallocates.
+	for free := cap(dst) - len(dst); free > 0; free-- {
+		dst = append(dst, 0x0)
+	}
+	return dst
+}
+
+// WeightStringLen returns numBytes unchanged: WeightString emits one weight
+// byte per source byte for this collation.
+func (c *Collation_binary) WeightStringLen(numBytes int) int {
+	return numBytes
+}
diff --git a/go/mysql/collations/charset/charset.go b/go/mysql/collations/charset/charset.go
@@ -0,0 +1,88 @@
+package charset
+
+import (
+	"errors"
+	"unicode/utf8"
+
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/encoding/unicode/utf32"
+)
+
+// RuneError is the rune the iterators in this package return when they cannot
+// decode their input. It is the same value as utf8.RuneError.
+const RuneError = utf8.RuneError
+
+// UnicodeCharset identifies one of the Unicode encodings this package can
+// iterate over and encode to.
+type UnicodeCharset int
+
+// The supported charsets. Each value selects a decoder in Iterator and an
+// encoder in EncodeFromUTF8.
+const (
+	// Charset_utf8mb4 is decoded with utf8.DecodeRune.
+	Charset_utf8mb4 UnicodeCharset = iota
+	// Charset_utf8 is decoded with iteratorUTF8mb3 and is limited to the BMP
+	// when encoding.
+	Charset_utf8
+	// Charset_utf16 is decoded with iteratorUTF16BE.
+	Charset_utf16
+	// Charset_ucs2 is decoded with iteratorUCS2 and is limited to the BMP
+	// when encoding.
+	Charset_ucs2
+	// Charset_utf32 is decoded with iteratorUTF32.
+	Charset_utf32
+)
+
+// SupportsSupplementaryChars reports whether ch can represent code points
+// above U+FFFF. It is false for Charset_utf8 and Charset_ucs2 and true for
+// every other value.
+func (ch UnicodeCharset) SupportsSupplementaryChars() bool {
+	switch ch {
+	case Charset_utf8, Charset_ucs2:
+		return false
+	default:
+		return true
+	}
+}
+
+// CodepointIterator decodes the first code point of its argument and returns
+// it together with a width in bytes. The iterators in this package return
+// RuneError with width 0 when the input is too short and RuneError with width
+// 1 when the leading bytes are not a valid sequence.
+type CodepointIterator func([]byte) (rune, int)
+
+func (ch UnicodeCharset) Iterator() CodepointIterator {
+	switch ch {
+	case Charset_utf8mb4:
+		return utf8.DecodeRune
+	case Charset_utf8:
+		return iteratorUTF8mb3
+	case Charset_utf16:
+		return iteratorUTF16BE
+	case Charset_ucs2:
+		return iteratorUCS2
+	case Charset_utf32:
+		return iteratorUTF32
+	default:
+		panic("bad charset")
+	}
+}
+
+// defaultUTF16 is the big-endian, BOM-ignoring UTF-16 encoding used by
+// EncodeFromUTF8 for Charset_utf16 and Charset_ucs2.
+var defaultUTF16 = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
+// defaultUTF32 is the big-endian, BOM-ignoring UTF-32 encoding used by
+// EncodeFromUTF8 for Charset_utf32.
+var defaultUTF32 = utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM)
+// errBMPRange is returned by ensureBMPRange when the input contains a code
+// point above U+FFFF.
+var errBMPRange = errors.New("input string contains characters outside of BMP range (cp > 0xFFFF)")
+
+// ensureBMPRange checks that every code point of the UTF-8 text in lies in
+// the Basic Multilingual Plane. It returns errBMPRange as soon as a code
+// point above U+FFFF is found and nil otherwise.
+//
+// Bytes that are not valid UTF-8 decode to utf8.RuneError (U+FFFD), which is
+// inside the BMP, so malformed input alone never causes an error.
+//
+// The input is decoded in place rather than through a string conversion, so
+// no temporary copy of in is made.
+func ensureBMPRange(in []byte) error {
+	for len(in) > 0 {
+		// DecodeRune always consumes at least one byte of non-empty input.
+		cp, width := utf8.DecodeRune(in)
+		if cp > 0xFFFF {
+			return errBMPRange
+		}
+		in = in[width:]
+	}
+	return nil
+}
+
+// EncodeFromUTF8 converts the UTF-8 text in to the encoding selected by ch.
+//
+// Charset_utf8mb4 returns in itself. Charset_utf8 and Charset_ucs2 first
+// reject input containing code points above U+FFFF (returning a nil slice and
+// the error from ensureBMPRange); Charset_utf8 then returns in itself, while
+// Charset_ucs2 and Charset_utf16 encode with defaultUTF16. Charset_utf32
+// encodes with defaultUTF32. Errors reported by the encoders are returned
+// as-is. Any other value of ch panics with "bad charset".
+func (ch UnicodeCharset) EncodeFromUTF8(in []byte) ([]byte, error) {
+	switch ch {
+	case Charset_utf8mb4:
+		return in, nil
+
+	case Charset_utf8, Charset_ucs2:
+		// Both charsets are restricted to the BMP.
+		if err := ensureBMPRange(in); err != nil {
+			return nil, err
+		}
+		if ch == Charset_utf8 {
+			return in, nil
+		}
+		return defaultUTF16.NewEncoder().Bytes(in)
+
+	case Charset_utf16:
+		return defaultUTF16.NewEncoder().Bytes(in)
+
+	case Charset_utf32:
+		return defaultUTF32.NewEncoder().Bytes(in)
+	}
+
+	panic("bad charset")
+}
diff --git a/go/mysql/collations/charset/utf16.go b/go/mysql/collations/charset/utf16.go
@@ -0,0 +1,40 @@
+package charset
+
+// iteratorUTF16BE decodes the first code point of the big-endian UTF-16 text
+// in b and returns it with the number of bytes it occupies.
+//
+// A code unit outside the surrogate range is returned with width 2, a valid
+// high/low surrogate pair with width 4. When b is too short (fewer than 2
+// bytes, or fewer than 4 after a surrogate) the result is (RuneError, 0).
+// A surrogate that does not form a valid pair yields (RuneError, 1).
+func iteratorUTF16BE(b []byte) (rune, int) {
+	// [highStart, lowStart) holds the high 10 bits of a pair,
+	// [lowStart, surrEnd) the low 10 bits; the decoded value is those
+	// 20 bits plus suppBase.
+	const (
+		highStart = 0xd800
+		lowStart  = 0xdc00
+		surrEnd   = 0xe000
+		suppBase  = 0x10000
+	)
+
+	if len(b) < 2 {
+		return RuneError, 0
+	}
+	unit1 := uint16(b[0])<<8 | uint16(b[1])
+	if unit1 < highStart || unit1 >= surrEnd {
+		// Not a surrogate: the code unit is the code point.
+		return rune(unit1), 2
+	}
+
+	if len(b) < 4 {
+		return RuneError, 0
+	}
+	unit2 := uint16(b[2])<<8 | uint16(b[3])
+
+	// unit1 is known to be a surrogate here; it must be a high one and
+	// unit2 must be a low one.
+	if unit1 >= lowStart || unit2 < lowStart || unit2 >= surrEnd {
+		return RuneError, 1
+	}
+	return ((rune(unit1)-highStart)<<10 | (rune(unit2) - lowStart)) + suppBase, 4
+}
+
+// iteratorUCS2 reads the first two bytes of p as one big-endian 16-bit code
+// unit and returns it as a rune with width 2. Surrogate values are returned
+// unchanged. When p holds fewer than 2 bytes the result is (RuneError, 0).
+func iteratorUCS2(p []byte) (rune, int) {
+	if len(p) >= 2 {
+		hi, lo := rune(p[0]), rune(p[1])
+		return hi<<8 | lo, 2
+	}
+	return RuneError, 0
+}
diff --git a/go/mysql/collations/charset/utf32.go b/go/mysql/collations/charset/utf32.go
@@ -0,0 +1,8 @@
+package charset
+
+func iteratorUTF32(p []byte) (rune, int) {
+	if len(p) < 4 {
+		return RuneError, 0
+	}
+	return (rune(p[0]) << 24) | (rune(p[1]) << 16) | (rune(p[2]) << 8) | rune(p[3]), 4
+}
diff --git a/go/mysql/collations/charset/utf8.go b/go/mysql/collations/charset/utf8.go
@@ -0,0 +1,96 @@
+package charset
+
+// Constants used by iteratorUTF8mb3 to decode UTF-8 sequences of up to three
+// bytes.
+const (
+	// Payload masks: maskx for a continuation byte, mask2 and mask3 for the
+	// lead byte of a 2-byte and a 3-byte sequence.
+	maskx = 0b00111111
+	mask2 = 0b00011111
+	mask3 = 0b00001111
+
+	// The default lowest and highest continuation byte.
+	locb = 0b10000000
+	hicb = 0b10111111
+
+	// The names of these constants are chosen to give nice alignment in the
+	// table below. The first nibble is an index into acceptRanges, or F for
+	// the special one-byte cases. The second nibble is the rune length, or
+	// the status for the special one-byte cases.
+	xx = 0xF1 // invalid: size 1
+	as = 0xF0 // ASCII: size 1
+	s1 = 0x02 // accept 0, size 2
+	s2 = 0x13 // accept 1, size 3
+	s3 = 0x03 // accept 0, size 3
+	s4 = 0x23 // accept 2, size 3
+)
+
+// first is information about the first byte in a UTF-8 sequence, indexed by
+// that byte. Each entry is one of the as/xx/s1..s4 constants. Lead bytes
+// 0xF0-0xFF (four-byte sequences and above) are marked invalid, so only
+// sequences of up to three bytes are accepted.
+var first = [256]uint8{
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
+	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
+	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
+	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
+}
+
+// acceptRange gives the range of valid values for the second byte in a UTF-8
+// sequence.
+type acceptRange struct {
+	lo uint8 // lowest value for second byte.
+	hi uint8 // highest value for second byte.
+}
+
+// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
+// It is indexed by the high nibble of an entry of the first table. The
+// s1..s4 constants only select entries 0, 1 and 2; entries 3 and 4 are not
+// referenced by any of them.
+var acceptRanges = [16]acceptRange{
+	0: {locb, hicb},
+	1: {0xA0, hicb},
+	2: {locb, 0x9F},
+	3: {0x90, hicb},
+	4: {locb, 0x8F},
+}
+
+// iteratorUTF8mb3 decodes the first code point of the UTF-8 text in p and
+// returns it with the number of bytes it occupies. Only sequences of one to
+// three bytes are accepted.
+//
+// Empty input yields (RuneError, 0). A lead byte marked invalid in the first
+// table, a truncated sequence, or a continuation byte outside its accepted
+// range yields (RuneError, 1).
+func iteratorUTF8mb3(p []byte) (rune, int) {
+	if len(p) == 0 {
+		return RuneError, 0
+	}
+
+	lead := p[0]
+	info := first[lead]
+
+	// The two one-byte cases: plain ASCII and an invalid lead byte.
+	switch info {
+	case as:
+		return rune(lead), 1
+	case xx:
+		return RuneError, 1
+	}
+
+	// Low bits of info hold the sequence length, the high nibble selects
+	// the range allowed for the second byte.
+	size := int(info & 7)
+	if len(p) < size {
+		return RuneError, 1
+	}
+
+	second := p[1]
+	if allowed := acceptRanges[info>>4]; second < allowed.lo || second > allowed.hi {
+		return RuneError, 1
+	}
+	if size <= 2 {
+		return rune(lead&mask2)<<6 | rune(second&maskx), 2
+	}
+
+	third := p[2]
+	if third < locb || third > hicb {
+		return RuneError, 1
+	}
+	if size <= 3 {
+		return rune(lead&mask3)<<12 | rune(second&maskx)<<6 | rune(third&maskx), 3
+	}
+
+	return RuneError, 1
+}