Merge pull request #8991 from vmg/collations

MySQL-compatible Collations
vitessio · Oct 19, 2021 · 0f1ee35 · 0f1ee35
2 parents f7df1e8 + 09d731f
commit 0f1ee35
Show file tree

Hide file tree

Showing 61 changed files with 89,546 additions and 0 deletions.
diff --git a/go/mysql/collations/8bit.go b/go/mysql/collations/8bit.go
@@ -0,0 +1,180 @@
+/*
+Copyright 2021 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package collations
+
+func init() {
+	register(&Collation_binary{})
+}
+
+type simpletables struct {
+	tounicode []uint16
+	tolower   []byte
+	toupper   []byte
+	ctype     []byte
+	sort      []byte
+}
+
+type Collation_8bit_bin struct {
+	id   uint
+	name string
+	simpletables
+}
+
+func (c *Collation_8bit_bin) init() {}
+
+func (c *Collation_8bit_bin) Name() string {
+	return c.name
+}
+func (c *Collation_8bit_bin) Id() uint {
+	return c.id
+}
+
+func (c *Collation_8bit_bin) Collate(left, right []byte, rightIsPrefix bool) int {
+	return collationBinary(left, right, rightIsPrefix)
+}
+
+func (c *Collation_8bit_bin) WeightString(dst, src []byte, numCodepoints int) []byte {
+	copyCodepoints := minInt(len(src), cap(dst))
+
+	var padToMax bool
+	switch numCodepoints {
+	case 0:
+		numCodepoints = copyCodepoints
+	case PadToMax:
+		padToMax = true
+	default:
+		copyCodepoints = minInt(copyCodepoints, numCodepoints)
+	}
+
+	dst = append(dst, src[:copyCodepoints]...)
+	return weightStringPadingSimple(' ', dst, numCodepoints-copyCodepoints, padToMax)
+}
+
+func (c *Collation_8bit_bin) WeightStringLen(numBytes int) int {
+	return numBytes
+}
+
+type Collation_8bit_simple_ci struct {
+	id   uint
+	name string
+	simpletables
+}
+
+func (c *Collation_8bit_simple_ci) init() {}
+
+func (c *Collation_8bit_simple_ci) Name() string {
+	return c.name
+}
+
+func (c *Collation_8bit_simple_ci) Id() uint {
+	return c.id
+}
+
+func (c *Collation_8bit_simple_ci) Collate(left, right []byte, rightIsPrefix bool) int {
+	sortOrder := c.sort
+	cmpLen := minInt(len(left), len(right))
+
+	for i := 0; i < cmpLen; i++ {
+		sortL, sortR := sortOrder[int(left[i])], sortOrder[int(right[i])]
+		if sortL != sortR {
+			return int(sortL) - int(sortR)
+		}
+	}
+	if rightIsPrefix {
+		left = left[:cmpLen]
+	}
+	return len(left) - len(right)
+}
+
+func (c *Collation_8bit_simple_ci) WeightString(dst, src []byte, numCodepoints int) []byte {
+	padToMax := false
+	sortOrder := c.sort
+	copyCodepoints := minInt(len(src), cap(dst))
+
+	switch numCodepoints {
+	case 0:
+		numCodepoints = copyCodepoints
+	case PadToMax:
+		padToMax = true
+	default:
+		copyCodepoints = minInt(copyCodepoints, numCodepoints)
+	}
+
+	for _, ch := range src[:copyCodepoints] {
+		dst = append(dst, sortOrder[int(ch)])
+	}
+	return weightStringPadingSimple(' ', dst, numCodepoints-copyCodepoints, padToMax)
+}
+
+func (c *Collation_8bit_simple_ci) WeightStringLen(numBytes int) int {
+	return numBytes
+}
+
+func weightStringPadingSimple(padChar byte, dst []byte, numCodepoints int, padToMax bool) []byte {
+	if padToMax {
+		for len(dst) < cap(dst) {
+			dst = append(dst, padChar)
+		}
+	} else {
+		for numCodepoints > 0 {
+			dst = append(dst, padChar)
+			numCodepoints--
+		}
+	}
+	return dst
+}
+
+type Collation_binary struct{}
+
+func (c *Collation_binary) init() {}
+
+func (c *Collation_binary) Id() uint {
+	return 63
+}
+
+func (c *Collation_binary) Name() string {
+	return "binary"
+}
+
+func (c *Collation_binary) Collate(left, right []byte, isPrefix bool) int {
+	return collationBinary(left, right, isPrefix)
+}
+
+func (c *Collation_binary) WeightString(dst, src []byte, numCodepoints int) []byte {
+	padToMax := false
+	copyCodepoints := minInt(len(src), cap(dst))
+
+	switch numCodepoints {
+	case 0: // no-op
+	case PadToMax:
+		padToMax = true
+	default:
+		copyCodepoints = minInt(copyCodepoints, numCodepoints)
+	}
+
+	dst = append(dst, src[:copyCodepoints]...)
+	if padToMax {
+		for len(dst) < cap(dst) {
+			dst = append(dst, 0x0)
+		}
+	}
+	return dst
+}
+
+func (c *Collation_binary) WeightStringLen(numBytes int) int {
+	return numBytes
+}
diff --git a/go/mysql/collations/collation.go b/go/mysql/collations/collation.go
@@ -0,0 +1,161 @@
+/*
+Copyright 2021 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package collations
+
+import (
+	"fmt"
+	"math"
+)
+
+// Generate mysqldata.go from the JSON information dumped from MySQL
+//go:generate go run ./tools/makemysqldata/
+
+// Collation implements a MySQL-compatible collation. It defines how to compare
+// for sorting order and equality two strings with the same encoding.
+type Collation interface {
+	// init initializes the internal state for the collation the first time it is used
+	init()
+
+	// Id returns the numerical identifier for this collation. This is the same
+	// value that is returned by MySQL in a query's headers to identify the collation
+	// for a given column
+	Id() uint
+
+	// Name is the full name of this collation, in the form of "ENCODING_LANG_SENSITIVITY"
+	Name() string
+
+	// Collate compares two strings using this collation. `left` and `right` must be the
+	// two strings encoded in the proper encoding for this collation. If `isPrefix` is true,
+	// the function instead behaves equivalently to `strings.HasPrefix(left, right)`, but
+	// being collation-aware.
+	// It returns a numeric value like a normal comparison function: <0 if left < right,
+	// 0 if left == right, >0 if left > right
+	Collate(left, right []byte, isPrefix bool) int
+
+	// WeightString returns a weight string for the given `src` string. A weight string
+	// is a binary representation of the weights for the given string, that can be
+	// compared byte-wise to return identical results to collating this string.
+	//
+	// This means:
+	//		bytes.Compare(WeightString(left), WeightString(right)) == Collate(left, right)
+	//
+	// The semantics of this API have been carefully designed to match MySQL's behavior
+	// in its `strnxfrm` API. Most notably, the `numCodepoints` argument implies different
+	// behaviors depending on the collation's padding mode:
+	//
+	// - For collations that pad WITH SPACE (this is, all legacy collations in MySQL except
+	//	for the newly introduced UCA v9.0.0 utf8mb4 collations in MySQL 8.0), `numCodepoints`
+	// 	can have the following values:
+	//
+	//		- if `numCodepoints` is any integer greater than zero, this treats the `src` string
+	//		as if it were in a `CHAR(numCodepoints)` column in MySQL, meaning that the resulting
+	//		weight string will be padded with the weight for the SPACE character until it becomes
+	//		wide enough to fill the `CHAR` column. This is necessary to perform weight comparisons
+	//		in fixed-`CHAR` columns. If `numCodepoints` is smaller than the actual amount of
+	//		codepoints stored in `src`, the result is unspecified.
+	//
+	//		- if `numCodepoints` is zero, this is equivalent to `numCodepoints = RuneCount(src)`,
+	//		meaning that the resulting weight string will have no padding at the end: it'll only have
+	//		the weight values for the exact amount of codepoints contained in `src`. This is the
+	//		behavior required to sort `VARCHAR` columns.
+	//
+	//		- if `numCodepoints` is the special constant PadToMax, then the `dst` slice must be
+	//		pre-allocated to a zero-length slice with enough capacity to hold the complete weight
+	//		string, and any remaining capacity in `dst` will be filled by the weights for the
+	//		padding character, repeatedly. This is a special flag used by MySQL when performing
+	//		filesorts, where all the sorting keys must have identical sizes, even for `VARCHAR`
+	//		columns.
+	//
+	//	- For collations that have NO PAD (this is, the newly introduced UCA v9.0.0 utf8mb4 collations
+	//	in MySQL 8.0), `numCodepoints` can only have the special constant `PadToMax`, which will make
+	//	the weight string padding equivalent to a PAD SPACE collation (as explained in the previous
+	//	section). All other values for `numCodepoints` are ignored, because NO PAD collations always
+	//	return the weights for the codepoints in their strings, with no further padding at the end.
+	//
+	// The resulting weight string is written to `dst`, which can be pre-allocated to
+	// WeightStringLen() bytes to prevent growing the slice. `dst` can also be nil, in which
+	// case it will grow dynamically. If `numCodepoints` has the special PadToMax value explained
+	// earlier, `dst` MUST be pre-allocated to the target size or the function will return an
+	// empty slice.
+	WeightString(dst, src []byte, numCodepoints int) []byte
+
+	// WeightStringLen returns a size (in bytes) that would fit any weight strings for a string
+	// with `numCodepoints` using this collation. Note that this is a higher bound for the size
+	// of the string, and in practice weight strings can be significantly smaller than the
+	// returned value.
+	WeightStringLen(numCodepoints int) int
+}
+
+const PadToMax = math.MaxInt32
+
+func minInt(i1, i2 int) int {
+	if i1 < i2 {
+		return i1
+	}
+	return i2
+}
+
+var collationsByName = make(map[string]Collation)
+var collationsById = make(map[uint]Collation)
+
+func register(c Collation) {
+	duplicatedCharset := func(old Collation) {
+		panic(fmt.Sprintf("duplicated collation: %s[%d] (existing collation is %s[%d])",
+			c.Name(), c.Id(), old.Name(), old.Id(),
+		))
+	}
+	if old, found := collationsByName[c.Name()]; found {
+		duplicatedCharset(old)
+	}
+	if old, found := collationsById[c.Id()]; found {
+		duplicatedCharset(old)
+	}
+	collationsByName[c.Name()] = c
+	collationsById[c.Id()] = c
+}
+
+// LookupByName returns the collation with the given name. The collation
+// is initialized if it's the first time being accessed.
+func LookupByName(name string) Collation {
+	csi := collationsByName[name]
+	if csi != nil {
+		csi.init()
+	}
+	return csi
+}
+
+// LookupById returns the collation with the given numerical identifier. The collation
+// is initialized if it's the first time being accessed.
+func LookupById(id uint) Collation {
+	csi := collationsById[id]
+	if csi != nil {
+		csi.init()
+	}
+	return csi
+}
+
+// All returns a slice with all known collations in Vitess. This is an expensive call because
+// it will initialize the internal state of all the collations before returning them.
+// Used for testing/debugging.
+func All() (all []Collation) {
+	all = make([]Collation, 0, len(collationsById))
+	for _, col := range collationsById {
+		col.init()
+		all = append(all, col)
+	}
+	return
+}
diff --git a/go/mysql/collations/golden_test.go b/go/mysql/collations/golden_test.go
@@ -0,0 +1,41 @@
+package collations
+
+import (
+	"bytes"
+	"fmt"
+	"path/filepath"
+	"testing"
+)
+
+func TestGoldenWeights(t *testing.T) {
+	gllGoldenTests, err := filepath.Glob("testdata/wiki_*.gob.gz")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for _, goldenPath := range gllGoldenTests {
+		golden := &GoldenTest{}
+		if err := golden.DecodeFromFile(goldenPath); err != nil {
+			t.Fatal(err)
+		}
+
+		for _, goldenCase := range golden.Cases {
+			t.Run(fmt.Sprintf("%s (%s)", golden.Name, goldenCase.Lang), func(t *testing.T) {
+				for coll, expected := range goldenCase.Weights {
+					coll := testcollation(t, coll)
+
+					input, err := coll.(CollationUCA).Encoding().EncodeFromUTF8(goldenCase.Text)
+					if err != nil {
+						t.Fatal(err)
+					}
+
+					result := coll.WeightString(nil, input, 0)
+					if !bytes.Equal(expected, result) {
+						t.Errorf("mismatch for collation=%s\noriginal: %s\ninput:    %#v\nexpected: %v\nactual:   %v",
+							coll.Name(), string(goldenCase.Text), input, expected, result)
+					}
+				}
+			})
+		}
+	}
+}