-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
collations: MySQL-compatible text collations
Co-authored-by: Lakshya Singh <[email protected]> Signed-off-by: Vicent Marti <[email protected]> Signed-off-by: Lakshya Singh <[email protected]>
- Loading branch information
Showing
58 changed files
with
123,699 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
/* | ||
Copyright 2021 The Vitess Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package collations | ||
|
||
func init() { | ||
register(&Collation_binary{}) | ||
} | ||
|
||
// simpletables bundles the per-byte lookup tables that the 8-bit collations
// in this file embed. Only sort is read by the code in this file; the purpose
// of the remaining tables is inferred from their names.
type simpletables struct {
	tounicode []uint16 // presumably maps each byte to its Unicode code point (TODO confirm)
	tolower   []byte   // presumably maps each byte to its lowercase form (TODO confirm)
	toupper   []byte   // presumably maps each byte to its uppercase form (TODO confirm)
	ctype     []byte   // presumably per-byte character class flags (TODO confirm)
	sort      []byte   // sort weight for each byte value, indexed by the raw byte
}
|
||
type Collation_8bit_bin struct { | ||
id uint | ||
name string | ||
simpletables | ||
} | ||
|
||
func (c *Collation_8bit_bin) init() {} | ||
|
||
func (c *Collation_8bit_bin) Name() string { | ||
return c.name | ||
} | ||
func (c *Collation_8bit_bin) Id() uint { | ||
return c.id | ||
} | ||
|
||
func (c *Collation_8bit_bin) Collate(left, right []byte, rightIsPrefix bool) int { | ||
return collationBinary(left, right, rightIsPrefix) | ||
} | ||
|
||
func (c *Collation_8bit_bin) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte { | ||
copyCodepoints := minInt(len(src), minInt(cap(dst), numCodepoints)) | ||
dst = append(dst, src[:copyCodepoints]...) | ||
return weightStringPadingSimple(' ', dst, numCodepoints-copyCodepoints, padToMax) | ||
} | ||
|
||
func (c *Collation_8bit_bin) WeightStringLen(numBytes int) int { | ||
return numBytes | ||
} | ||
|
||
type Collation_8bit_simple_ci struct { | ||
id uint | ||
name string | ||
simpletables | ||
} | ||
|
||
func (c *Collation_8bit_simple_ci) init() {} | ||
|
||
func (c *Collation_8bit_simple_ci) Name() string { | ||
return c.name | ||
} | ||
|
||
func (c *Collation_8bit_simple_ci) Id() uint { | ||
return c.id | ||
} | ||
|
||
func (c *Collation_8bit_simple_ci) Collate(left, right []byte, rightIsPrefix bool) int { | ||
sortOrder := c.sort | ||
cmpLen := minInt(len(left), len(right)) | ||
|
||
for i := 0; i < cmpLen; i++ { | ||
sortL, sortR := sortOrder[int(left[i])], sortOrder[int(right[i])] | ||
if sortL != sortR { | ||
return int(sortL) - int(sortR) | ||
} | ||
} | ||
if rightIsPrefix { | ||
left = left[:cmpLen] | ||
} | ||
return len(left) - len(right) | ||
} | ||
|
||
func (c *Collation_8bit_simple_ci) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte { | ||
sortOrder := c.sort | ||
copyCodepoints := minInt(len(src), minInt(cap(dst), numCodepoints)) | ||
for _, ch := range src[:copyCodepoints] { | ||
dst = append(dst, sortOrder[int(ch)]) | ||
} | ||
return weightStringPadingSimple(' ', dst, numCodepoints-copyCodepoints, padToMax) | ||
} | ||
|
||
func (c *Collation_8bit_simple_ci) WeightStringLen(numBytes int) int { | ||
return numBytes | ||
} | ||
|
||
// weightStringPadingSimple appends padChar to dst and returns the result.
//
// With padToMax set, dst is filled up to its capacity and numCodepoints is
// ignored. Otherwise at most numCodepoints pad bytes are appended, still
// without exceeding cap(dst); a zero or negative numCodepoints appends
// nothing. Because the padding never exceeds the spare capacity, the returned
// slice always shares dst's backing array.
func weightStringPadingSimple(padChar byte, dst []byte, numCodepoints int, padToMax bool) []byte {
	// Number of pad bytes that still fit without reallocating.
	pad := cap(dst) - len(dst)
	if !padToMax && numCodepoints < pad {
		pad = numCodepoints
	}
	for ; pad > 0; pad-- {
		dst = append(dst, padChar)
	}
	return dst
}
|
||
// Collation_binary is the stateless "binary" collation (id 63), which treats
// strings as opaque byte sequences.
type Collation_binary struct{}

// init is a no-op: the collation has no state to set up.
func (c *Collation_binary) init() {}

// Id returns the fixed numeric identifier of the binary collation, 63.
func (c *Collation_binary) Id() uint {
	return 63
}

// Name returns the fixed collation name, "binary".
func (c *Collation_binary) Name() string {
	return "binary"
}
|
||
func (c *Collation_binary) Collate(left, right []byte, isPrefix bool) int { | ||
return collationBinary(left, right, isPrefix) | ||
} | ||
|
||
func (c *Collation_binary) WeightString(dst []byte, numCodepoints int, src []byte, padToMax bool) []byte { | ||
copyCodepoints := minInt(len(src), minInt(cap(dst), numCodepoints)) | ||
dst = append(dst, src[:copyCodepoints]...) | ||
if padToMax { | ||
for len(dst) < cap(dst) { | ||
dst = append(dst, 0x0) | ||
} | ||
} | ||
return dst | ||
} | ||
|
||
func (c *Collation_binary) WeightStringLen(numBytes int) int { | ||
return numBytes | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package charset | ||
|
||
import ( | ||
"errors" | ||
"unicode/utf8" | ||
|
||
"golang.org/x/text/encoding/unicode" | ||
"golang.org/x/text/encoding/unicode/utf32" | ||
) | ||
|
||
// RuneError is the replacement rune (utf8.RuneError) that the iterators in
// this package return for input they cannot decode.
const RuneError = utf8.RuneError

// UnicodeCharset enumerates the Unicode encodings this package can iterate
// over and encode to.
type UnicodeCharset int

const (
	// Charset_utf8mb4 is UTF-8; it is decoded with utf8.DecodeRune.
	Charset_utf8mb4 UnicodeCharset = iota
	// Charset_utf8 is UTF-8 restricted to sequences of at most three bytes
	// (decoded with iteratorUTF8mb3), i.e. no code points above U+FFFF.
	Charset_utf8
	// Charset_utf16 is big-endian UTF-16, surrogate pairs included.
	Charset_utf16
	// Charset_ucs2 is big-endian two-byte code units without surrogate pairs.
	Charset_ucs2
	// Charset_utf32 is big-endian UTF-32.
	Charset_utf32
)

// SupportsSupplementaryChars reports whether the charset can represent code
// points above U+FFFF. Every charset except Charset_utf8 and Charset_ucs2
// can.
func (ch UnicodeCharset) SupportsSupplementaryChars() bool {
	return ch != Charset_utf8 && ch != Charset_ucs2
}
|
||
type CodepointIterator func([]byte) (rune, int) | ||
|
||
func (ch UnicodeCharset) Iterator() CodepointIterator { | ||
switch ch { | ||
case Charset_utf8mb4: | ||
return utf8.DecodeRune | ||
case Charset_utf8: | ||
return iteratorUTF8mb3 | ||
case Charset_utf16: | ||
return iteratorUTF16BE | ||
case Charset_ucs2: | ||
return iteratorUCS2 | ||
case Charset_utf32: | ||
return iteratorUTF32 | ||
default: | ||
panic("bad charset") | ||
} | ||
} | ||
|
||
var defaultUTF16 = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM) | ||
var defaultUTF32 = utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM) | ||
// errBMPRange is returned when UTF-8 input holds a code point above U+FFFF
// but the target charset is limited to the Basic Multilingual Plane.
var errBMPRange = errors.New("input string contains characters outside of BMP range (cp > 0xFFFF)")

// ensureBMPRange returns errBMPRange if the UTF-8 text in contains a code
// point above U+FFFF, and nil otherwise. Bytes that are not valid UTF-8
// decode as utf8.RuneError (U+FFFD) and therefore pass the check.
//
// The input is decoded in place with utf8.DecodeRune instead of ranging over
// string(in), which would copy the whole slice first.
func ensureBMPRange(in []byte) error {
	for len(in) > 0 {
		// DecodeRune reports a width of at least 1 for non-empty input, so
		// the loop always makes progress.
		cp, width := utf8.DecodeRune(in)
		if cp > 0xFFFF {
			return errBMPRange
		}
		in = in[width:]
	}
	return nil
}
|
||
func (ch UnicodeCharset) EncodeFromUTF8(in []byte) ([]byte, error) { | ||
switch ch { | ||
case Charset_utf8mb4: | ||
return in, nil | ||
|
||
case Charset_utf8: | ||
if err := ensureBMPRange(in); err != nil { | ||
return nil, err | ||
} | ||
return in, nil | ||
|
||
case Charset_utf16: | ||
return defaultUTF16.NewEncoder().Bytes(in) | ||
|
||
case Charset_ucs2: | ||
if err := ensureBMPRange(in); err != nil { | ||
return nil, err | ||
} | ||
return defaultUTF16.NewEncoder().Bytes(in) | ||
|
||
case Charset_utf32: | ||
return defaultUTF32.NewEncoder().Bytes(in) | ||
|
||
default: | ||
panic("bad charset") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package charset | ||
|
||
func iteratorUTF16BE(b []byte) (rune, int) { | ||
// 0xd800-0xdc00 encodes the high 10 bits of a pair. | ||
// 0xdc00-0xe000 encodes the low 10 bits of a pair. | ||
// the value is those 20 bits plus 0x10000. | ||
const ( | ||
surr1 = 0xd800 | ||
surr2 = 0xdc00 | ||
surr3 = 0xe000 | ||
surrSelf = 0x10000 | ||
) | ||
|
||
if len(b) < 2 { | ||
return RuneError, 0 | ||
} | ||
|
||
r1 := uint16(b[1]) | uint16(b[0])<<8 | ||
if r1 < surr1 || surr3 <= r1 { | ||
return rune(r1), 2 | ||
} | ||
|
||
if len(b) < 4 { | ||
return RuneError, 0 | ||
} | ||
|
||
r2 := uint16(b[3]) | uint16(b[2])<<8 | ||
if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { | ||
return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + surrSelf, 4 | ||
} | ||
|
||
return RuneError, 1 | ||
} | ||
|
||
func iteratorUCS2(p []byte) (rune, int) { | ||
if len(p) < 2 { | ||
return RuneError, 0 | ||
} | ||
return rune(p[0])<<8 | rune(p[1]), 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package charset | ||
|
||
func iteratorUTF32(p []byte) (rune, int) { | ||
if len(p) < 4 { | ||
return RuneError, 0 | ||
} | ||
return (rune(p[0]) << 24) | (rune(p[1]) << 16) | (rune(p[2]) << 8) | rune(p[3]), 4 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package charset | ||
|
||
// Constants for the three-byte-maximum UTF-8 decoder (iteratorUTF8mb3).
const (
	// Masks selecting the payload bits of a continuation byte (maskx) and of
	// the lead byte of a two-byte (mask2) or three-byte (mask3) sequence.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table below. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
)
|
||
// first is information about the first byte in a UTF-8 sequence. | ||
var first = [256]uint8{ | ||
// 1 2 3 4 5 6 7 8 9 A B C D E F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F | ||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F | ||
// 1 2 3 4 5 6 7 8 9 A B C D E F | ||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F | ||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F | ||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF | ||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF | ||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF | ||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF | ||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF | ||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF | ||
} | ||
|
||
// acceptRange gives the range of valid values for the second byte in a UTF-8 | ||
// sequence. | ||
type acceptRange struct { | ||
lo uint8 // lowest value for second byte. | ||
hi uint8 // highest value for second byte. | ||
} | ||
|
||
// acceptRanges has size 16 to avoid bounds checks in the code that uses it. | ||
var acceptRanges = [16]acceptRange{ | ||
0: {locb, hicb}, | ||
1: {0xA0, hicb}, | ||
2: {locb, 0x9F}, | ||
3: {0x90, hicb}, | ||
4: {locb, 0x8F}, | ||
} | ||
|
||
func iteratorUTF8mb3(p []byte) (rune, int) { | ||
n := len(p) | ||
if n < 1 { | ||
return RuneError, 0 | ||
} | ||
p0 := p[0] | ||
x := first[p0] | ||
if x >= as { | ||
// The following code simulates an additional check for x == xx and | ||
// handling the ASCII and invalid cases accordingly. This mask-and-or | ||
// approach prevents an additional branch. | ||
mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF. | ||
return rune(p[0])&^mask | RuneError&mask, 1 | ||
} | ||
sz := int(x & 7) | ||
accept := acceptRanges[x>>4] | ||
if n < sz { | ||
return RuneError, 1 | ||
} | ||
b1 := p[1] | ||
if b1 < accept.lo || accept.hi < b1 { | ||
return RuneError, 1 | ||
} | ||
if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks | ||
return rune(p0&mask2)<<6 | rune(b1&maskx), 2 | ||
} | ||
b2 := p[2] | ||
if b2 < locb || hicb < b2 { | ||
return RuneError, 1 | ||
} | ||
if sz <= 3 { | ||
return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3 | ||
} | ||
return RuneError, 1 | ||
} |
Oops, something went wrong.