diff --git a/cmd/explaintest/r/new_character_set_builtin.result b/cmd/explaintest/r/new_character_set_builtin.result index b4944f3057611..8772baab07111 100644 Binary files a/cmd/explaintest/r/new_character_set_builtin.result and b/cmd/explaintest/r/new_character_set_builtin.result differ diff --git a/cmd/explaintest/t/new_character_set_builtin.test b/cmd/explaintest/t/new_character_set_builtin.test index 0c6fefac7919d..3726951591461 100644 --- a/cmd/explaintest/t/new_character_set_builtin.test +++ b/cmd/explaintest/t/new_character_set_builtin.test @@ -57,6 +57,13 @@ select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk); select convert(a using binary) from t; select convert(convert('中文' using gbk) using binary), convert('中文' using binary); select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary); +drop table if exists t; +create table t(a binary(10)); +insert into t values (0xe240), (0x01e240); +set @@tidb_enable_vectorized_expression = true; +select hex(convert(a using gbk)), convert(a using gbk) from t; +set @@tidb_enable_vectorized_expression = false; +select hex(convert(a using gbk)), convert(a using gbk) from t; -- test for builtin function md5() drop table if exists t; diff --git a/expression/builtin_string.go b/expression/builtin_string.go index bf948605045b2..86f426be2faaa 100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -41,7 +41,6 @@ import ( "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tipb/go-tipb" "go.uber.org/zap" - "golang.org/x/text/transform" ) var ( @@ -1144,29 +1143,26 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { // Since charset is already validated and set from getFunction(), there's no // need to get charset from args again. - encoding, _ := charset.Lookup(b.tp.Charset) + enc := charset.NewEncoding(b.tp.Charset) // However, if `b.tp.Charset` is abnormally set to a wrong charset, we still // return with error. - if encoding == nil { + if enc.Name() == "" { return "", true, errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset) } // if expr is binary string and convert meet error, we should return NULL. if types.IsBinaryStr(b.args[0].GetType()) { - target, _, err := transform.String(encoding.NewEncoder(), expr) + target, err := enc.DecodeString(expr) if err != nil { - return "", true, err + b.ctx.GetSessionVars().StmtCtx.AppendWarning(err) + return "", true, nil } - - // we should convert target into utf8 internal. - exprInternal, _, _ := transform.String(encoding.NewDecoder(), target) - return exprInternal, false, nil + return target, false, nil } if types.IsBinaryStr(b.tp) { enc := charset.NewEncoding(b.args[0].GetType().Charset) expr, err = enc.EncodeString(expr) return expr, false, err } - enc := charset.NewEncoding(b.tp.Charset) return string(enc.EncodeInternal(nil, []byte(expr))), false, nil } diff --git a/expression/builtin_string_test.go b/expression/builtin_string_test.go index 28d98d0215091..22f3ea294d129 100644 --- a/expression/builtin_string_test.go +++ b/expression/builtin_string_test.go @@ -885,14 +885,16 @@ func TestConvert(t *testing.T) { str interface{} cs string result string + isNull bool hasBinaryFlag bool }{ - {"haha", "utf8", "haha", false}, - {"haha", "ascii", "haha", false}, - {"haha", "binary", "haha", true}, - {"haha", "bInAry", "haha", true}, - {types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true}, - {types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", false}, + {"haha", "utf8", "haha", false, false}, + {"haha", "ascii", "haha", false, false}, + {"haha", "binary", "haha", false, true}, + {"haha", "bInAry", "haha", false, true}, + {types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true, true}, + {types.NewBinaryLiteralFromUint(0x1e240, -1), "utf8", "", true, false}, + {types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", true, false}, } for _, v := range tbl { fc := funcs[ast.Convert] @@ -908,8 +910,12 @@ func TestConvert(t *testing.T) { r, err := evalBuiltinFunc(f, chunk.Row{}) require.NoError(t, err) - require.Equal(t, types.KindString, r.Kind()) - require.Equal(t, v.result, r.GetString()) + if !v.isNull { + require.Equal(t, types.KindString, r.Kind()) + require.Equal(t, v.result, r.GetString()) + } else { + require.True(t, v.isNull) + } } // Test case for getFunction() error @@ -1441,9 +1447,8 @@ func TestChar(t *testing.T) { {"65", 16740, 67.5, "utf8", "AAdD", 0}, // large num {"65", -1, 67.5, nil, "A\xff\xff\xff\xffD", 0}, // nagtive int {"a", -1, 67.5, nil, "\x00\xff\xff\xff\xffD", 0}, // invalid 'a' - // TODO: Uncomment it when issue #29685 be closed - // {"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil - // {"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil + {"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil + {"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil // TODO: Uncomment it when gbk be added into charsetInfos // {"1234567", 1234567, 1234567, "gbk", "謬謬謬", 0}, // test char for gbk // {"123456789", 123456789, 123456789, "gbk", nil, 3}, // invalid 123456789 in gbk diff --git a/expression/builtin_string_vec.go b/expression/builtin_string_vec.go index 11933f305fe58..7612836bf21dc 100644 --- a/expression/builtin_string_vec.go +++ b/expression/builtin_string_vec.go @@ -30,7 +30,6 @@ import ( "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" - "golang.org/x/text/transform" ) func (b *builtinLowerSig) vecEvalString(input *chunk.Chunk, result *chunk.Column) error { @@ -679,17 +678,14 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu } // Since charset is already validated and set from getFunction(), there's no // need to get charset from args again. - encoding, _ := charset.Lookup(b.tp.Charset) + enc := charset.NewEncoding(b.tp.Charset) // However, if `b.tp.Charset` is abnormally set to a wrong charset, we still // return with error. - if encoding == nil { + if enc.Name() == "" { return errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset) } - encoder := encoding.NewEncoder() - decoder := encoding.NewDecoder() isBinaryStr := types.IsBinaryStr(b.args[0].GetType()) isRetBinary := types.IsBinaryStr(b.tp) - enc := charset.NewEncoding(b.tp.Charset) if isRetBinary { enc = charset.NewEncoding(b.args[0].GetType().Charset) } @@ -702,13 +698,12 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu } exprI := expr.GetString(i) if isBinaryStr { - target, _, err := transform.String(encoder, exprI) + target, err := enc.DecodeString(exprI) if err != nil { - return err + result.AppendNull() + continue } - // we should convert target into utf8 internal. - exprInternal, _, _ := transform.String(decoder, target) - result.AppendString(exprInternal) + result.AppendString(target) } else { if isRetBinary { str, err := enc.EncodeString(exprI) diff --git a/expression/integration_test.go b/expression/integration_test.go index d25e3ad04195a..f0d949cf5735f 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -10580,3 +10580,15 @@ func (s *testIntegrationSuite) TestIssue29244(c *C) { tk.MustExec("set tidb_enable_vectorized_expression = off;") tk.MustQuery("select microsecond(a) from t;").Check(testkit.Rows("123500", "123500")) } + +func (s *testIntegrationSuite) TestIssue29685(c *C) { + tk := testkit.NewTestKit(c, s.store) + tk.MustExec("use test") + tk.MustExec("drop table if exists t") + tk.MustExec("create table t (a binary(5));") + tk.MustExec("insert into t values (0x1e240), ('ABCDE');") + tk.MustExec("set tidb_enable_vectorized_expression = on;") + tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("", "ABCDE")) + tk.MustExec("set tidb_enable_vectorized_expression = off;") + tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("", "ABCDE")) +} diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 72bff5b6cf7e9..9f9302a6e5ac0 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -19,6 +19,7 @@ import ( "reflect" "strings" "unicode" + "unicode/utf8" "unsafe" "github.com/cznic/mathutil" @@ -140,8 +141,40 @@ func (e *Encoding) EncodeInternal(dest, src []byte) []byte { return dest } +// validUTF8 checks whether there are illegal utf8 characters in []byte. +func (e *Encoding) validUTF8(src []byte) ([]byte, error) { + resultBytes := src + for len(src) > 0 { + r, size := utf8.DecodeRune(src) + if r == utf8.RuneError && size == 1 { + return resultBytes, e.generateErr(src, characterLengthUTF8(src)) + } + src = src[size:] + } + + return resultBytes, nil +} + +// validUTF8String checks whether there are illegal utf8 characters in string. +func (e *Encoding) validUTF8String(src string) (string, error) { + resultStr := src + for len(src) > 0 { + r, size := utf8.DecodeRuneInString(src) + if r == utf8.RuneError && size == 1 { + srcBytes := []byte(src) + return resultStr, e.generateErr(srcBytes, characterLengthUTF8(srcBytes)) + } + src = src[size:] + } + + return resultStr, nil +} + // Decode convert bytes from a specific charset to utf-8 charset. func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { + if e.name == encodings[CharsetUTF8].name { + return e.validUTF8(src) + } if !e.enabled() { return src, nil } @@ -150,6 +183,9 @@ func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { // DecodeString convert a string from a specific charset to utf-8 charset. func (e *Encoding) DecodeString(src string) (string, error) { + if e.name == encodings[CharsetUTF8].name { + return e.validUTF8String(src) + } if !e.enabled() { return src, nil }