Skip to content

Commit

Permalink
vindexes: Efficient unicode hashing (#14395)
Browse files Browse the repository at this point in the history
Signed-off-by: Vicent Marti <[email protected]>
Signed-off-by: Dirkjan Bussink <[email protected]>
Co-authored-by: Dirkjan Bussink <[email protected]>
  • Loading branch information
vmg and dbussink authored Oct 31, 2023
1 parent eddb7da commit 7388255
Show file tree
Hide file tree
Showing 39 changed files with 83,546 additions and 153 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ require (
golang.org/x/oauth2 v0.7.0
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.13.0
golang.org/x/text v0.13.0 // indirect
golang.org/x/time v0.3.0
golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846
google.golang.org/api v0.121.0
Expand Down
2 changes: 1 addition & 1 deletion go/mysql/collations/colldata/golden_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
"github.com/stretchr/testify/assert"

"vitess.io/vitess/go/mysql/collations/charset"
"vitess.io/vitess/go/mysql/collations/internal/testutil"
"vitess.io/vitess/go/mysql/collations/testutil"
)

func TestGoldenWeights(t *testing.T) {
Expand Down
22 changes: 18 additions & 4 deletions go/mysql/collations/integration/collations_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,17 @@ import (
"bufio"
"bytes"
"encoding/hex"
"errors"
"fmt"
"os"
"path"
"path/filepath"
"strings"
"testing"
"unicode/utf8"

"github.com/spf13/pflag"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding/unicode/utf32"

"vitess.io/vitess/go/mysql/collations/colldata"

Expand Down Expand Up @@ -95,16 +96,29 @@ type uca900CollationTest struct {
collation string
}

var defaultUtf32 = utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM)
// decodeUtf32 appends the UTF-8 encoding of the big-endian UTF-32 code units
// found in src to dst and returns the extended slice.
//
// src is consumed four bytes at a time. If one to three bytes are left over
// after the last complete unit, the function returns a nil slice and a
// "short src" error; complete units preceding the remainder have already been
// appended to dst's backing storage by then. Encoding of each code point is
// delegated to utf8.AppendRune.
func decodeUtf32(dst, src []byte) ([]byte, error) {
	off := 0
	for ; len(src)-off >= 4; off += 4 {
		cp := uint32(src[off])<<24 |
			uint32(src[off+1])<<16 |
			uint32(src[off+2])<<8 |
			uint32(src[off+3])
		dst = utf8.AppendRune(dst, rune(cp))
	}
	// Anything not consumed by the loop is a truncated code unit.
	if off != len(src) {
		return nil, errors.New("short src")
	}
	return dst, nil
}

func parseUtf32cp(b []byte) []byte {
var hexbuf [16]byte
c, err := hex.Decode(hexbuf[:], b)
if err != nil {
return nil
}
utf8, _ := defaultUtf32.NewDecoder().Bytes(hexbuf[:c])
return utf8
dst, err := decodeUtf32(nil, hexbuf[:c])
if err != nil {
panic("failed to decode utf32")
}
return dst
}

func parseWeightString(b []byte) []byte {
Expand Down
2 changes: 1 addition & 1 deletion go/mysql/collations/integration/weight_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (
"vitess.io/vitess/go/mysql/collations"
"vitess.io/vitess/go/mysql/collations/charset"
"vitess.io/vitess/go/mysql/collations/colldata"
"vitess.io/vitess/go/mysql/collations/internal/testutil"
"vitess.io/vitess/go/mysql/collations/remote"
"vitess.io/vitess/go/mysql/collations/testutil"
)

func TestFastIterators(t *testing.T) {
Expand Down
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion go/mysql/collations/tools/maketestdata/maketestdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ import (
"github.com/spf13/pflag"

"vitess.io/vitess/go/mysql/collations/colldata"
"vitess.io/vitess/go/mysql/collations/testutil"

"vitess.io/vitess/go/internal/flag"
"vitess.io/vitess/go/mysql/collations"
"vitess.io/vitess/go/mysql/collations/charset"
"vitess.io/vitess/go/mysql/collations/internal/testutil"
)

func wikiRequest(lang testutil.Lang, args map[string]string, output any) error {
Expand Down
6 changes: 6 additions & 0 deletions go/mysql/collations/vindex/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Add no patterns to .gitignore except for files generated by the build.
last-change
/DATA
# This file is rather large and the tests really only need to be run
# after generation.
/unicode/norm/data_test.go
27 changes: 27 additions & 0 deletions go/mysql/collations/vindex/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 changes: 22 additions & 0 deletions go/mysql/collations/vindex/PATENTS
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Additional IP Rights Grant (Patents)

"This implementation" means the copyrightable works distributed by
Google as part of the Go project.

Google hereby grants to You a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer and otherwise run, modify and propagate the contents of this
implementation of Go, where such license applies only to those patent
claims, both currently owned or controlled by Google and acquired in
the future, licensable by Google that are necessarily infringed by this
implementation of Go. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of Go or any code incorporated within this
implementation of Go constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of Go
shall terminate as of the date such litigation is filed.
68 changes: 68 additions & 0 deletions go/mysql/collations/vindex/collate/collate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// TODO: remove hard-coded versions when we have implemented fractional weights.
// The current implementation is incompatible with later CLDR versions.
//go:generate go run maketables.go -cldr=23 -unicode=6.2.0

// Package collate contains types for comparing and sorting Unicode strings
// according to a given collation order.
package collate // import "vitess.io/vitess/go/mysql/collations/vindex/collate"

import (
"hash"

"vitess.io/vitess/go/mysql/collations/vindex/internal/colltab"
)

// Hasher feeds the primary collation weights of a byte string into a hash
// function. It bundles the collation element iterator, the hash receiving the
// encoded weights, and a fixed array used as initial storage for the
// iterator's elements.
type Hasher struct {
// iter walks the input and yields collation elements.
iter colltab.Iter
// hash accumulates the encoded primary weights.
hash hash.Hash
// scratch is the array that iter.Elems is initially sliced from.
scratch [32]colltab.Elem
}

// New builds a Hasher that writes collation weights into h.
//
// The Hasher's iterator uses the table selected by tableIndex{0x15, 0x0} as
// its Weighter, and its element slice starts empty, backed by the Hasher's
// own scratch array.
func New(h hash.Hash) *Hasher {
	hasher := &Hasher{hash: h}
	hasher.iter.Elems = hasher.scratch[:0]
	hasher.iter.Weighter = getTable(tableIndex{0x15, 0x0})
	return hasher
}

// Hash resets the underlying hash, iterates over the collation elements of
// str, and returns the digest of their encoded primary weights.
//
// Each primary weight greater than zero is encoded big-endian: weights up to
// 0x7FFF take two bytes, larger weights take three bytes with the top bit of
// the first byte set. Elements whose primary weight is not greater than zero
// contribute nothing.
//
// The method reuses the Hasher's iterator and hash state, so presumably a
// single Hasher must not be used from several goroutines at once.
func (c *Hasher) Hash(str []byte) []byte {
	c.hash.Reset()
	c.iter.SetInput(str)

	// Encoded weights are staged in a fixed-size local buffer and handed to
	// the hash in batches instead of one weight at a time.
	var (
		buf  [64]byte
		used int
	)

	for c.iter.Next() {
		for i := 0; i < c.iter.N; i++ {
			weight := c.iter.Elems[i].Primary()
			switch {
			case weight <= 0:
				// Nothing to hash for this element.
			case weight <= 0x7FFF:
				// Two-byte form; flush first if it would not fit.
				if used+2 > len(buf) {
					c.hash.Write(buf[:used])
					used = 0
				}
				buf[used] = uint8(weight >> 8)
				buf[used+1] = uint8(weight)
				used += 2
			default:
				// Three-byte form, marked by the high bit of the first byte.
				if used+3 > len(buf) {
					c.hash.Write(buf[:used])
					used = 0
				}
				buf[used] = uint8(weight>>16) | 0x80
				buf[used+1] = uint8(weight >> 8)
				buf[used+2] = uint8(weight)
				used += 3
			}
		}
		c.iter.Discard()
	}

	// Flush whatever is still staged before producing the digest.
	c.hash.Write(buf[:used])
	return c.hash.Sum(nil)
}
30 changes: 30 additions & 0 deletions go/mysql/collations/vindex/collate/index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package collate

import "vitess.io/vitess/go/mysql/collations/vindex/internal/colltab"

// blockSize is the factor by which the offsets in a tableIndex are scaled
// when getTable slices the main lookup and value tables.
const blockSize = 64

// getTable assembles a colltab.Table on top of the shared main tables.
//
// The trie's Index and Values cover the full main lookup and value tables,
// while Index0 and Values0 start blockSize*t.lookupOffset and
// blockSize*t.valuesOffset entries into them respectively. The expansion and
// contraction data are the full main slices.
func getTable(t tableIndex) *colltab.Table {
	lookup, values := mainLookup[:], mainValues[:]

	trie := colltab.Trie{
		Index:   lookup,
		Values:  values,
		Index0:  lookup[blockSize*t.lookupOffset:],
		Values0: values[blockSize*t.valuesOffset:],
	}

	return &colltab.Table{
		Index:         trie,
		ExpandElem:    mainExpandElem[:],
		ContractTries: mainCTEntries[:],
		ContractElem:  mainContractElem[:],
	}
}

// tableIndex holds information for constructing a table
// for a certain locale based on the main table.
// Both offsets are expressed in units of blockSize entries.
type tableIndex struct {
// lookupOffset selects where Index0 starts within mainLookup.
lookupOffset uint32
// valuesOffset selects where Values0 starts within mainValues.
valuesOffset uint32
}
Loading

0 comments on commit 7388255

Please sign in to comment.