Skip to content

Commit

Permalink
Merge pull request #9558 from vmg/mysql-conn-charset
Browse files Browse the repository at this point in the history
mysql: use UTF8mb4 as consistent connection charset
  • Loading branch information
vmg authored Jan 27, 2022
2 parents 0bd6b78 + e933177 commit af3f66d
Show file tree
Hide file tree
Showing 27 changed files with 113 additions and 219 deletions.
1 change: 0 additions & 1 deletion go/cmd/vttestserver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ func init() {
"Rdonly tablets per shard")

flag.StringVar(&config.Charset, "charset", "utf8mb4", "MySQL charset")
flag.StringVar(&config.Collation, "collation", "", "MySQL collation")

flag.StringVar(&config.PlannerVersion, "planner_version", "v3", "Sets the default planner to use when the session has not changed it. Valid values are: V3, Gen4, Gen4Greedy and Gen4Fallback. Gen4Fallback tries the new gen4 planner and falls back to the V3 planner if the gen4 fails. All Gen4 versions should be considered experimental!")

Expand Down
85 changes: 4 additions & 81 deletions go/mysql/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,6 @@ func Connect(ctx context.Context, params *ConnParams) (*Conn, error) {
}
}

// Once we are connected to the server, we set the collation for this connection.
// This step usually occurs during the handshake, however, the handshake protocol
// grants us 8 bits for the collation ID, which is lower than the range of supported
// collations. For this reason, we manually set the collation for the connection.
if err := setCollationForConnection(c, params); err != nil {
return nil, err
}

return c, nil
}

Expand Down Expand Up @@ -202,58 +194,6 @@ func (c *Conn) Ping() error {
return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unexpected packet type: %d", data[0])
}

// setCollationForConnection resolves the collation requested in params and
// applies it to the already-established connection c.
//
// The collation is resolved from params.Charset and params.Collation through a
// collation environment built from the server version string received during
// the handshake, so that version-specific defaults are honoured. Resolution
// rules (empty charset and/or collation, unknown names) are those of
// Environment.ResolveCollation; any resolution error is returned unchanged.
//
// Once resolved, the collation is applied by running a
// "SET collation_connection" statement on the connection, and its ID is then
// recorded in c.Collation. If the statement fails, c.Collation is left
// untouched and the error is returned.
func setCollationForConnection(c *Conn, params *ConnParams) error {
	// Default collations for a charset can differ between MySQL and MariaDB
	// versions, hence the environment is derived from the server's version.
	resolved, err := collations.NewEnvironment(c.ServerVersion).ResolveCollation(params.Charset, params.Collation)
	if err != nil {
		return err
	}

	// See: https://dev.mysql.com/doc/refman/8.0/en/charset-connection.html
	_, err = c.ExecuteFetch(fmt.Sprintf("SET collation_connection = %s;", resolved.Name()), 1, false)
	if err != nil {
		return err
	}

	c.Collation = resolved.ID()
	return nil
}

// getHandshakeCharacterSet looks up DefaultCollation in the local collation
// environment and returns its ID narrowed to the single byte that the
// handshake packet reserves for the character set.
//
// An INTERNAL error is returned when DefaultCollation cannot be found, or when
// its ID does not fit in 8 bits. Neither case is expected in practice.
func getHandshakeCharacterSet() (uint8, error) {
	def := collations.Local().LookupByName(DefaultCollation)
	if def == nil {
		// The default collation is unknown to the local environment.
		return 0, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "cannot resolve collation ID for collation: '%s'", DefaultCollation)
	}

	id := def.ID()
	if id > 255 {
		// The ID would be truncated when stored in a single byte.
		return 0, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "collation ID for '%s' will overflow, value: %d", DefaultCollation, id)
	}
	return uint8(id), nil
}

// clientHandshake handles the client side of the handshake.
// Note the connection can be closed while this is running.
// Returns a SQLError.
Expand Down Expand Up @@ -282,24 +222,7 @@ func (c *Conn) clientHandshake(params *ConnParams) error {
c.Capabilities = capabilities & (CapabilityClientDeprecateEOF)
}

// The MySQL handshake package uses the "character set" field to define
// which character set must be used. But, the value we give to this field
// correspond in fact to the collation ID. MySQL will then deduce what the
// character set for this collation ID is, and use it.
// Problem is, this field is 8-bits long meaning that the ID can range from
// 0 to 255, which is smaller than the range of IDs we support.
// If, for instance, we used the collation "utf8mb4_0900_as_ci" that has an
// ID equal to 305, the value would overflow when transformed into an 8 bits
// integer.
// To alleviate this issue, we use a default and safe collation for the handshake
// and once the connection is established, we will manually set the collation.
// The code below gets that default character set for the Handshake packet.
//
// Note: this character set might be different from the one we will use
// for the connection.
//
// See: https://dev.mysql.com/doc/internals/en/connection-phase-packets.html
characterSet, err := getHandshakeCharacterSet()
charset, err := collations.Local().ParseConnectionCharset(params.Charset)
if err != nil {
return err
}
Expand Down Expand Up @@ -340,7 +263,7 @@ func (c *Conn) clientHandshake(params *ConnParams) error {
}

// Send the SSLRequest packet.
if err := c.writeSSLRequest(capabilities, characterSet, params); err != nil {
if err := c.writeSSLRequest(capabilities, charset, params); err != nil {
return err
}

Expand Down Expand Up @@ -371,7 +294,7 @@ func (c *Conn) clientHandshake(params *ConnParams) error {

// Build and send our handshake response 41.
// Note this one will never have SSL flag on.
if err := c.writeHandshakeResponse41(capabilities, scrambledPassword, characterSet, params); err != nil {
if err := c.writeHandshakeResponse41(capabilities, scrambledPassword, charset, params); err != nil {
return err
}

Expand Down Expand Up @@ -473,7 +396,7 @@ func (c *Conn) parseInitialHandshakePacket(data []byte) (uint32, []byte, error)
if !ok {
return 0, nil, NewSQLError(CRMalformedPacket, SSUnknownSQLState, "parseInitialHandshakePacket: packet has no character set")
}
c.CharacterSet = characterSet
c.CharacterSet = collations.ID(characterSet)

// Status flags. Ignored.
_, pos, ok = readUint16(data, pos)
Expand Down
2 changes: 0 additions & 2 deletions go/mysql/collations/8bit.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,6 @@ func weightStringPadingSimple(padChar byte, dst []byte, numCodepoints int, padTo
return dst
}

const CollationBinaryID ID = 63

// Collation_binary is a field-less collation type.
// NOTE(review): presumably this implements MySQL's `binary` collation; its
// remaining methods are not visible in this hunk, so confirm against the file.
type Collation_binary struct{}

// Init is a no-op: Collation_binary carries no state that needs initializing.
func (c *Collation_binary) Init() {}
Expand Down
76 changes: 50 additions & 26 deletions go/mysql/collations/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,32 +123,6 @@ func fetchCacheEnvironment(version collver) *Environment {
return env
}

// ResolveCollation returns the collation to use for the given charset and
// collation names.
//
//   - If collation is non-empty it is looked up by name and returned; charset
//     is not consulted in that case.
//   - Otherwise the default collation for charset is returned. An empty
//     charset is treated as "utf8mb4".
//
// An error is returned when the collation name is unknown to this environment,
// or when no default collation exists for the charset. The error names
// whichever of the two inputs could not be resolved.
func (env *Environment) ResolveCollation(charset, collation string) (Collation, error) {
	if collation != "" {
		// An explicit collation wins: make sure it exists and is supported.
		coll := env.LookupByName(collation)
		if coll == nil {
			return nil, fmt.Errorf("cannot resolve collation: '%s'", collation)
		}
		return coll, nil
	}

	// No collation was given, fall back to the default one for the charset.
	if charset == "" {
		charset = "utf8mb4"
	}
	coll := env.DefaultCollationForCharset(charset)
	if coll == nil {
		// Report the charset here: the collation is empty on this path, so
		// mentioning it would produce an uninformative "''" in the message.
		return nil, fmt.Errorf("cannot resolve default collation for charset: '%s'", charset)
	}
	return coll, nil
}

// NewEnvironment creates a collation Environment for the given MySQL version string.
// The version string must be in the format that is sent by the server as the version packet
// when opening a new MySQL connection
Expand Down Expand Up @@ -246,3 +220,53 @@ func Local() *Environment {
})
return defaultEnv
}

// A few interesting character set values.
// In the MySQL protocol a "character set" byte actually carries a collation ID,
// so these are collation IDs that fit in a single byte.
// The constants are left untyped so they can be used wherever an integer
// collation/charset identifier is expected.
// See http://dev.mysql.com/doc/internals/en/character-set.html#packet-Protocol::CharacterSet
const (
// CollationUtf8ID identifies the 3-byte `utf8` charset
// (utf8_general_ci in MySQL's collation table — TODO confirm).
CollationUtf8ID = 33
// CollationUtf8mb4ID identifies utf8mb4_0900_ai_ci, the default connection
// charset used for MySQL 8.0+ environments.
CollationUtf8mb4ID = 255
// CollationBinaryID identifies the `binary` collation.
CollationBinaryID = 63
)

// DefaultConnectionCharset returns the charset that Vitess negotiates by
// default in a MySQL connection handshake for this environment's version.
//
// In the handshake a "charset" is really a collation ID, restricted to the
// values that fit in one byte:
//   - MySQL 8.0+ environments use `utf8mb4_0900_ai_ci` (CollationUtf8mb4ID).
//   - Every other environment uses `utf8mb4_general_ci`.
func (env *Environment) DefaultConnectionCharset() uint8 {
	if env.version == collverMySQL80 {
		return CollationUtf8mb4ID
	}
	// 45 is the collation ID of utf8mb4_general_ci.
	return 45
}

// ParseConnectionCharset converts a charset name into the one-byte numerical
// identifier sent in a MySQL connection handshake.
//
// The name is matched case-insensitively and is interpreted as follows:
//   - empty: the default connection charset for this MySQL version is returned.
//   - the name of a character set: the ID of that character set's default
//     collation is returned. Character set names take precedence over
//     collation names.
//   - the name of a collation: the ID of that collation is returned.
//
// An error is returned when the name matches nothing, or when the resulting ID
// is greater than 255: such collations cannot be negotiated in the single byte
// available in the handshake.
func (env *Environment) ParseConnectionCharset(csname string) (uint8, error) {
	if csname == "" {
		return env.DefaultConnectionCharset(), nil
	}

	name := strings.ToLower(csname)

	// A zero ID doubles as the "not found" marker below.
	var id ID
	if byCharset, found := env.byCharset[name]; found {
		id = byCharset.Default.ID()
	} else if byName, found := env.byName[name]; found {
		id = byName.ID()
	}

	if id != 0 && id <= 255 {
		return uint8(id), nil
	}
	return 0, fmt.Errorf("unsupported connection charset: %q", name)
}
19 changes: 8 additions & 11 deletions go/mysql/conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,17 +182,14 @@ type Conn struct {
// by Handler methods.
StatusFlags uint16

// CharacterSet is the character set used by the other side of the
// connection.
// It is set during the initial handshake.
// See the values in constants.go.
CharacterSet uint8

// Collation defines the collation for this connection, it has the same
// value as the collation_connection variable of MySQL.
// Its value is set after we send the initial "SET collation_connection"
// query to MySQL after the handshake is done.
Collation collations.ID
// CharacterSet is the charset for this connection, as negotiated
// in our handshake with the server. Note that although the MySQL protocol lists this
// as a "character set", the returned byte value is actually a Collation ID,
// and hence it is cast as such here.
// If the user has specified a custom Collation in the ConnParams for this
// connection, once the CharacterSet has been negotiated, we will override
// it via SQL and update this field accordingly.
CharacterSet collations.ID

// Packet encoding variables.
sequence uint8
Expand Down
1 change: 0 additions & 1 deletion go/mysql/conn_params.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ type ConnParams struct {
DbName string `json:"dbname"`
UnixSocket string `json:"unix_socket"`
Charset string `json:"charset"`
Collation string `json:"collation"`
Flags uint64 `json:"flags"`
Flavor string `json:"flavor,omitempty"`

Expand Down
15 changes: 0 additions & 15 deletions go/mysql/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -591,21 +591,6 @@ const (
SSQueryInterrupted = "70100"
)

// A few interesting character set values.
// See http://dev.mysql.com/doc/internals/en/character-set.html#packet-Protocol::CharacterSet
const (
// DefaultCollation is the name of the collation whose ID is sent in the
// character set field of the connection handshake.
DefaultCollation = "utf8mb4_general_ci"

// CharacterSetUtf8 is for UTF8.
CharacterSetUtf8 = 33

// CharacterSetUtf8mb4 is for 4-byte UTF8.
CharacterSetUtf8mb4 = 45

// CharacterSetBinary is for binary. It is used by integer fields, for instance.
CharacterSetBinary = 63
)

// CharacterSetEncoding maps a charset name to a golang encoder.
// golang does not support encoders for all MySQL charsets.
// A charset not in this map is unsupported.
Expand Down
21 changes: 15 additions & 6 deletions go/mysql/endtoend/query_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,20 @@ import (
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/mysql/collations"
"vitess.io/vitess/go/sqltypes"

querypb "vitess.io/vitess/go/vt/proto/query"
)

// columnSize returns the column length the tests should expect for a text
// column of the given nominal size when the connection uses collation cs.
//
// Connections whose collation belongs to the `utf8` charset report 3/4 of the
// nominal size (presumably because utf8 uses up to 3 bytes per character where
// utf8mb4 uses 4 — TODO confirm). Every other collation reports size unchanged.
// A collation ID that is unknown to the local environment is also reported
// unchanged, rather than dereferencing a nil collation.
func columnSize(cs collations.ID, size uint32) uint32 {
	coll := collations.Local().LookupByID(cs)
	if coll == nil {
		// Unknown collation ID: there is no charset to inspect.
		return size
	}
	// utf8_general_ci results in smaller max column sizes on MySQL 5.7.
	if coll.Charset().Name() == "utf8" {
		return size * 3 / 4
	}
	return size
}

// Test the SQL query part of the API.
func TestQueries(t *testing.T) {
ctx := context.Background()
Expand Down Expand Up @@ -79,7 +88,7 @@ func TestQueries(t *testing.T) {
Database: "vttest",
OrgName: "id",
ColumnLength: 11,
Charset: mysql.CharacterSetBinary,
Charset: collations.CollationBinaryID,
Flags: uint32(querypb.MySqlFlag_NOT_NULL_FLAG |
querypb.MySqlFlag_PRI_KEY_FLAG |
querypb.MySqlFlag_PART_KEY_FLAG |
Expand All @@ -92,8 +101,8 @@ func TestQueries(t *testing.T) {
OrgTable: "a",
Database: "vttest",
OrgName: "name",
ColumnLength: 512,
Charset: mysql.CharacterSetUtf8mb4,
ColumnLength: columnSize(conn.CharacterSet, 512),
Charset: uint32(conn.CharacterSet),
},
},
Rows: [][]sqltypes.Value{
Expand Down Expand Up @@ -188,7 +197,7 @@ func readRowsUsingStream(t *testing.T, conn *mysql.Conn, expectedCount int) {
Database: "vttest",
OrgName: "id",
ColumnLength: 11,
Charset: mysql.CharacterSetBinary,
Charset: collations.CollationBinaryID,
Flags: uint32(querypb.MySqlFlag_NOT_NULL_FLAG |
querypb.MySqlFlag_PRI_KEY_FLAG |
querypb.MySqlFlag_PART_KEY_FLAG |
Expand All @@ -201,8 +210,8 @@ func readRowsUsingStream(t *testing.T, conn *mysql.Conn, expectedCount int) {
OrgTable: "a",
Database: "vttest",
OrgName: "name",
ColumnLength: 512,
Charset: mysql.CharacterSetUtf8mb4,
ColumnLength: columnSize(conn.CharacterSet, 512),
Charset: uint32(conn.CharacterSet),
},
}
fields, err := conn.Fields()
Expand Down
3 changes: 0 additions & 3 deletions go/mysql/fakesqldb/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,6 @@ func (db *DB) ConnParams() dbconfigs.Connector {
UnixSocket: db.socketFile,
Uname: "user1",
Pass: "password1",
Charset: "utf8mb4",
Collation: "utf8mb4_general_ci",
})
}

Expand All @@ -283,7 +281,6 @@ func (db *DB) ConnParamsWithUname(uname string) dbconfigs.Connector {
UnixSocket: db.socketFile,
Uname: uname,
Pass: "password1",
Charset: "utf8",
})
}

Expand Down
Loading

0 comments on commit af3f66d

Please sign in to comment.