Skip to content

Commit

Permalink
collations: MySQL-compatible text collations
Browse files Browse the repository at this point in the history
Co-authored-by: Lakshya Singh <[email protected]>
Signed-off-by: Vicent Marti <[email protected]>
Signed-off-by: Lakshya Singh <[email protected]>
  • Loading branch information
vmg and king-11 committed Oct 15, 2021
1 parent ffe3574 commit 7c75c96
Show file tree
Hide file tree
Showing 58 changed files with 123,699 additions and 0 deletions.
148 changes: 148 additions & 0 deletions go/mysql/collations/8bit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
Copyright 2021 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package collations

// init hands a freshly allocated Collation_binary to register when the
// package is loaded.
func init() {
	register(new(Collation_binary))
}

// simpletables bundles the per-charset lookup tables embedded by the 8-bit
// collations below. Of these tables, only sort is read by the code visible
// in this file.
type simpletables struct {
// tounicode: presumably maps each byte value to a Unicode code unit — TODO confirm against the table data.
tounicode []uint16
// tolower and toupper: presumably per-byte case-mapping tables — TODO confirm.
tolower []byte
toupper []byte
// ctype: presumably per-byte character-class flags — TODO confirm.
ctype []byte
// sort is indexed by byte value and yields that byte's sort weight; it is
// what Collation_8bit_simple_ci uses to compare and weigh strings.
sort []byte
}

// Collation_8bit_bin is a collation for an 8-bit charset whose Collate
// delegates to collationBinary and whose WeightString copies the source bytes
// unchanged. Its methods in this file never read the embedded lookup tables.
type Collation_8bit_bin struct {
// id is the value returned by Id.
id uint
// name is the value returned by Name.
name string
simpletables
}

// init is a no-op for this collation.
// NOTE(review): presumably present to satisfy a package-level collation
// interface — confirm against its declaration.
func (c *Collation_8bit_bin) init() {}

// Name returns the collation name stored in c.name.
func (c *Collation_8bit_bin) Name() string {
return c.name
}
// Id returns the numeric collation identifier stored in c.id.
func (c *Collation_8bit_bin) Id() uint {
return c.id
}

// Collate compares left and right by delegating to collationBinary,
// forwarding rightIsPrefix unchanged. The receiver's tables are not consulted.
// NOTE(review): the sign convention of the result is defined by
// collationBinary, which is not visible here — presumably negative, zero or
// positive in the usual comparator sense; confirm.
func (c *Collation_8bit_bin) Collate(left, right []byte, rightIsPrefix bool) int {
return collationBinary(left, right, rightIsPrefix)
}

// WeightString appends the weight string of src to dst and returns the
// extended slice. For this collation the weights are the source bytes
// themselves.
//
// At most numCodepoints bytes are copied, further limited by len(src) and by
// cap(dst). The result is then padded with spaces by
// weightStringPadingSimple: up to cap(dst) when padToMax is set, otherwise by
// at most the number of requested codepoints that src did not supply.
func (c *Collation_8bit_bin) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
	limit := minInt(cap(dst), numCodepoints)
	n := minInt(len(src), limit)
	dst = append(dst, src[:n]...)

	missing := numCodepoints - n
	return weightStringPadingSimple(' ', dst, missing, padToMax)
}

// WeightStringLen returns numBytes unchanged: this collation emits exactly
// one weight byte per input byte.
func (c *Collation_8bit_bin) WeightStringLen(numBytes int) int {
return numBytes
}

// Collation_8bit_simple_ci is a collation for an 8-bit charset that compares
// and weighs strings byte by byte through the embedded sort table, so bytes
// that share a sort weight compare as equal.
type Collation_8bit_simple_ci struct {
// id is the value returned by Id.
id uint
// name is the value returned by Name.
name string
simpletables
}

// init is a no-op for this collation.
// NOTE(review): presumably present to satisfy a package-level collation
// interface — confirm against its declaration.
func (c *Collation_8bit_simple_ci) init() {}

// Name returns the collation name stored in c.name.
func (c *Collation_8bit_simple_ci) Name() string {
return c.name
}

// Id returns the numeric collation identifier stored in c.id.
func (c *Collation_8bit_simple_ci) Id() uint {
return c.id
}

// Collate compares left and right using the collation's sort table.
//
// Bytes are compared pairwise by sort weight over the common prefix; the
// first differing pair decides the result (difference of the two weights).
// If the common prefix is equal, the result is the difference in lengths.
// When rightIsPrefix is set, left is treated as if it were truncated to the
// compared length, so any extra bytes in left are ignored.
func (c *Collation_8bit_simple_ci) Collate(left, right []byte, rightIsPrefix bool) int {
	weights := c.sort
	shared := minInt(len(left), len(right))

	for i, lb := range left[:shared] {
		if wl, wr := weights[lb], weights[right[i]]; wl != wr {
			return int(wl) - int(wr)
		}
	}

	if rightIsPrefix {
		// left counts only up to the compared length.
		return shared - len(right)
	}
	return len(left) - len(right)
}

// WeightString appends the weight string of src to dst and returns the
// extended slice. Each copied source byte is replaced by its weight from the
// collation's sort table.
//
// At most numCodepoints bytes are converted, further limited by len(src) and
// by cap(dst). The result is then padded with spaces by
// weightStringPadingSimple: up to cap(dst) when padToMax is set, otherwise by
// at most the number of requested codepoints that src did not supply.
func (c *Collation_8bit_simple_ci) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
	weights := c.sort
	n := minInt(len(src), minInt(cap(dst), numCodepoints))

	prefix := src[:n]
	for i := range prefix {
		dst = append(dst, weights[prefix[i]])
	}

	return weightStringPadingSimple(' ', dst, numCodepoints-n, padToMax)
}

// WeightStringLen returns numBytes unchanged: this collation emits exactly
// one weight byte per input byte.
func (c *Collation_8bit_simple_ci) WeightStringLen(numBytes int) int {
return numBytes
}

// weightStringPadingSimple appends padChar to dst and returns the result. It
// never grows dst beyond its current capacity.
//
// With padToMax set, dst is filled all the way to cap(dst). Otherwise at most
// numCodepoints pad bytes are appended (none if numCodepoints <= 0), still
// bounded by the remaining capacity.
func weightStringPadingSimple(padChar byte, dst []byte, numCodepoints int, padToMax bool) []byte {
	pad := cap(dst) - len(dst)
	if !padToMax && numCodepoints < pad {
		pad = numCodepoints
	}
	for ; pad > 0; pad-- {
		dst = append(dst, padChar)
	}
	return dst
}

// Collation_binary is the stateless collation that reports Id 63 and Name
// "binary"; an instance is passed to register by this file's init function.
type Collation_binary struct{}

// init is a no-op; Collation_binary carries no state.
func (c *Collation_binary) init() {}

// Id returns the fixed numeric identifier of this collation, 63.
// NOTE(review): presumably MySQL's collation ID for "binary" — confirm.
func (c *Collation_binary) Id() uint {
return 63
}

// Name returns the fixed name of this collation, "binary".
func (c *Collation_binary) Name() string {
return "binary"
}

// Collate compares left and right by delegating to collationBinary,
// forwarding isPrefix unchanged.
// NOTE(review): the sign convention of the result is defined by
// collationBinary, which is not visible here — confirm.
func (c *Collation_binary) Collate(left, right []byte, isPrefix bool) int {
return collationBinary(left, right, isPrefix)
}

// WeightString appends the weight string of src to dst and returns the
// extended slice. For the binary collation the weights are the source bytes
// themselves.
//
// At most numCodepoints bytes are copied, further limited by len(src) and by
// cap(dst). Unlike the text collations in this file, padding happens only
// when padToMax is set, and it uses 0x00 bytes up to cap(dst) rather than
// spaces.
func (c *Collation_binary) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte {
	n := minInt(len(src), minInt(cap(dst), numCodepoints))
	dst = append(dst, src[:n]...)
	if !padToMax {
		return dst
	}

	for room := cap(dst) - len(dst); room > 0; room-- {
		dst = append(dst, 0x0)
	}
	return dst
}

// WeightStringLen returns numBytes unchanged: this collation emits exactly
// one weight byte per input byte.
func (c *Collation_binary) WeightStringLen(numBytes int) int {
return numBytes
}
88 changes: 88 additions & 0 deletions go/mysql/collations/charset/charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package charset

import (
"errors"
"unicode/utf8"

"golang.org/x/text/encoding/unicode"
"golang.org/x/text/encoding/unicode/utf32"
)

// RuneError is the rune returned by the iterators in this package for
// truncated or malformed input. It is the same value as utf8.RuneError.
const RuneError = utf8.RuneError

// UnicodeCharset identifies one of the Unicode encodings handled by this
// package.
type UnicodeCharset int

// The supported charsets. The trailing notes name the decoder that Iterator
// returns for each value.
const (
Charset_utf8mb4 UnicodeCharset = iota // utf8.DecodeRune
Charset_utf8 // iteratorUTF8mb3: UTF-8 limited to sequences of up to 3 bytes
Charset_utf16 // iteratorUTF16BE: big-endian UTF-16 with surrogate pairs
Charset_ucs2 // iteratorUCS2: fixed 2-byte big-endian units
Charset_utf32 // iteratorUTF32: fixed 4-byte big-endian units
)

// SupportsSupplementaryChars reports whether the charset can represent code
// points above U+FFFF. It is false only for Charset_utf8 and Charset_ucs2;
// every other value yields true.
func (ch UnicodeCharset) SupportsSupplementaryChars() bool {
	switch ch {
	case Charset_utf8, Charset_ucs2:
		return false
	default:
		return true
	}
}

// CodepointIterator decodes the first code point from the given bytes and
// returns it together with the number of bytes it occupies. The decoders
// defined in this package return RuneError with a width of 0 when the input
// is too short, and some return RuneError with a width of 1 for malformed
// input.
type CodepointIterator func([]byte) (rune, int)

// Iterator returns the decoding function for the charset. It panics with
// "bad charset" when ch is not one of the declared Charset_* values.
func (ch UnicodeCharset) Iterator() CodepointIterator {
	var decode CodepointIterator

	switch ch {
	case Charset_utf8mb4:
		decode = utf8.DecodeRune
	case Charset_utf8:
		decode = iteratorUTF8mb3
	case Charset_utf16:
		decode = iteratorUTF16BE
	case Charset_ucs2:
		decode = iteratorUCS2
	case Charset_utf32:
		decode = iteratorUTF32
	default:
		panic("bad charset")
	}

	return decode
}

// defaultUTF16 is the big-endian, BOM-ignoring UTF-16 encoding that
// EncodeFromUTF8 uses for both Charset_utf16 and Charset_ucs2.
var defaultUTF16 = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
// defaultUTF32 is the big-endian, BOM-ignoring UTF-32 encoding that
// EncodeFromUTF8 uses for Charset_utf32.
var defaultUTF32 = utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM)
// errBMPRange is returned by ensureBMPRange when the input contains a code
// point above U+FFFF.
var errBMPRange = errors.New("input string contains characters outside of BMP range (cp > 0xFFFF)")

// ensureBMPRange checks that every code point decoded from the UTF-8 input
// lies within the Basic Multilingual Plane. It returns errBMPRange at the
// first code point above U+FFFF and nil otherwise. Invalid UTF-8 bytes decode
// to U+FFFD during iteration and therefore do not trigger the error.
func ensureBMPRange(in []byte) error {
	const maxBMP = 0xFFFF

	text := string(in)
	for _, r := range text {
		if r <= maxBMP {
			continue
		}
		return errBMPRange
	}
	return nil
}

// EncodeFromUTF8 converts UTF-8 input into the byte representation of the
// charset.
//
// For Charset_utf8mb4 and Charset_utf8 the input slice itself is returned,
// without copying. Charset_utf16 and Charset_ucs2 are encoded with
// defaultUTF16, and Charset_utf32 with defaultUTF32. Charset_utf8 and
// Charset_ucs2 first reject input containing code points above U+FFFF,
// returning a nil slice and the error from ensureBMPRange.
//
// It panics with "bad charset" when ch is not one of the declared Charset_*
// values.
func (ch UnicodeCharset) EncodeFromUTF8(in []byte) ([]byte, error) {
	// The two BMP-only charsets share the same up-front validation.
	if ch == Charset_utf8 || ch == Charset_ucs2 {
		if err := ensureBMPRange(in); err != nil {
			return nil, err
		}
	}

	switch ch {
	case Charset_utf8mb4, Charset_utf8:
		return in, nil
	case Charset_utf16, Charset_ucs2:
		return defaultUTF16.NewEncoder().Bytes(in)
	case Charset_utf32:
		return defaultUTF32.NewEncoder().Bytes(in)
	default:
		panic("bad charset")
	}
}
40 changes: 40 additions & 0 deletions go/mysql/collations/charset/utf16.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package charset

// iteratorUTF16BE decodes the first code point of big-endian UTF-16 input and
// returns it with the number of bytes consumed.
//
// A code unit outside the surrogate range is returned directly with width 2.
// A high surrogate followed by a low surrogate is combined into a
// supplementary code point with width 4. Input too short for the unit or
// pair being decoded yields (RuneError, 0); a surrogate that does not form a
// valid pair yields (RuneError, 1).
func iteratorUTF16BE(b []byte) (rune, int) {
	// High surrogates occupy [highStart, lowStart) and carry the upper 10
	// bits; low surrogates occupy [lowStart, lowEnd) and carry the lower 10
	// bits. The decoded value is those 20 bits plus supplementaryBase.
	const (
		highStart         = 0xd800
		lowStart          = 0xdc00
		lowEnd            = 0xe000
		supplementaryBase = 0x10000
	)

	if len(b) < 2 {
		return RuneError, 0
	}
	hi := rune(b[0])<<8 | rune(b[1])
	if hi < highStart || hi >= lowEnd {
		return hi, 2
	}

	if len(b) < 4 {
		return RuneError, 0
	}
	if hi >= lowStart {
		// A low surrogate cannot start a pair.
		return RuneError, 1
	}
	lo := rune(b[2])<<8 | rune(b[3])
	if lo < lowStart || lo >= lowEnd {
		return RuneError, 1
	}

	return ((hi-highStart)<<10 | (lo - lowStart)) + supplementaryBase, 4
}

// iteratorUCS2 decodes one big-endian 16-bit unit from p and returns it with
// a width of 2. The value is returned as is, with no surrogate handling.
// Input shorter than two bytes yields (RuneError, 0).
func iteratorUCS2(p []byte) (rune, int) {
	if len(p) >= 2 {
		hi, lo := rune(p[0]), rune(p[1])
		return hi<<8 | lo, 2
	}
	return RuneError, 0
}
8 changes: 8 additions & 0 deletions go/mysql/collations/charset/utf32.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package charset

// iteratorUTF32 decodes one big-endian 32-bit unit from p and returns it with
// a width of 4. The four bytes are assembled as is; the value is not checked
// against the valid Unicode range. Input shorter than four bytes yields
// (RuneError, 0).
func iteratorUTF32(p []byte) (rune, int) {
	if len(p) < 4 {
		return RuneError, 0
	}

	var cp rune
	for _, b := range p[:4] {
		cp = cp<<8 | rune(b)
	}
	return cp, 4
}
96 changes: 96 additions & 0 deletions go/mysql/collations/charset/utf8.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package charset

const (
// Payload masks: maskx extracts the low 6 bits of a continuation byte;
// mask2 and mask3 extract the payload bits of the lead byte of a 2-byte
// and a 3-byte sequence respectively.
maskx = 0b00111111
mask2 = 0b00011111
mask3 = 0b00001111

// The default lowest and highest continuation byte.
locb = 0b10000000
hicb = 0b10111111

// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
)

// first is information about the first byte in a UTF-8 sequence. It is
// indexed by the lead byte and holds one of the xx/as/s1..s4 codes declared
// above. Lead bytes 0xF0-0xFF, which would begin a 4-byte sequence, are all
// marked invalid, so this table only admits sequences of up to 3 bytes.
var first = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence. Both bounds are inclusive. The third byte of a sequence is
// checked against the default continuation range [locb, hicb] instead.
type acceptRange struct {
lo uint8 // lowest value for second byte.
hi uint8 // highest value for second byte.
}

// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// It is indexed by the high nibble of an entry from the first table. The
// codes s1..s4 declared above only select entries 0, 1 and 2; entries 3 and 4
// are not selected by any of those codes.
var acceptRanges = [16]acceptRange{
0: {locb, hicb},
1: {0xA0, hicb},
2: {locb, 0x9F},
3: {0x90, hicb},
4: {locb, 0x8F},
}

// iteratorUTF8mb3 decodes the first code point of p as UTF-8 restricted to
// sequences of at most three bytes, and returns it with its width in bytes.
//
// Empty input yields (RuneError, 0). An invalid lead byte (including any byte
// that would start a 4-byte sequence), a truncated sequence, or an
// out-of-range continuation byte yields (RuneError, 1).
func iteratorUTF8mb3(p []byte) (rune, int) {
	if len(p) == 0 {
		return RuneError, 0
	}

	lead := p[0]
	info := first[lead]
	if info >= as {
		// One-byte cases. The low bit of the code separates an invalid lead
		// byte (xx) from plain ASCII (as).
		if info&1 != 0 {
			return RuneError, 1
		}
		return rune(lead), 1
	}

	size := int(info & 7)
	valid := acceptRanges[info>>4]
	if len(p) < size {
		return RuneError, 1
	}

	second := p[1]
	if second < valid.lo || second > valid.hi {
		return RuneError, 1
	}
	if size <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return rune(lead&mask2)<<6 | rune(second&maskx), 2
	}

	third := p[2]
	if third < locb || third > hicb {
		return RuneError, 1
	}
	if size <= 3 {
		return rune(lead&mask3)<<12 | rune(second&maskx)<<6 | rune(third&maskx), 3
	}
	return RuneError, 1
}
Loading

0 comments on commit 7c75c96

Please sign in to comment.