From 52766df7979047c7668cf67eeaf6c1cabb96238e Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Fri, 12 Nov 2021 16:40:59 +0800 Subject: [PATCH 1/6] add check for utf8 charset when decode --- expression/builtin_string.go | 14 ++++++++------ parser/charset/encoding.go | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/expression/builtin_string.go b/expression/builtin_string.go index bf948605045b2..14d3eb4cadd41 100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -41,7 +41,6 @@ import ( "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tipb/go-tipb" "go.uber.org/zap" - "golang.org/x/text/transform" ) var ( @@ -1144,21 +1143,25 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { // Since charset is already validated and set from getFunction(), there's no // need to get charset from args again. - encoding, _ := charset.Lookup(b.tp.Charset) + enc := charset.NewEncoding(b.tp.Charset) // However, if `b.tp.Charset` is abnormally set to a wrong charset, we still // return with error. - if encoding == nil { + if enc.Name() == "" { return "", true, errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset) } // if expr is binary string and convert meet error, we should return NULL. if types.IsBinaryStr(b.args[0].GetType()) { - target, _, err := transform.String(encoding.NewEncoder(), expr) + target, err := enc.EncodeString(expr) if err != nil { return "", true, err } // we should convert target into utf8 internal. - exprInternal, _, _ := transform.String(encoding.NewDecoder(), target) + exprInternal, err := enc.DecodeString(target) + if err != nil { + b.ctx.GetSessionVars().StmtCtx.AppendWarning(err) + return "", true, nil + } return exprInternal, false, nil } if types.IsBinaryStr(b.tp) { @@ -1166,7 +1169,6 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { expr, err = enc.EncodeString(expr) return expr, false, err } - enc := charset.NewEncoding(b.tp.Charset) return string(enc.EncodeInternal(nil, []byte(expr))), false, nil } diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 72bff5b6cf7e9..6394d0a63f497 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -19,6 +19,7 @@ import ( "reflect" "strings" "unicode" + "unicode/utf8" "unsafe" "github.com/cznic/mathutil" @@ -140,8 +141,39 @@ func (e *Encoding) EncodeInternal(dest, src []byte) []byte { return dest } +// validUTF8 checks whether there are illegal utf8 characters in []byte. +func (e *Encoding) validUTF8(src []byte) ([]byte, error) { + resultBytes := src + for len(src) > 0 { + r, size := utf8.DecodeRune(src) + if r == utf8.RuneError && size == 1 { + return resultBytes, e.generateErr(src, 3) + } + src = src[size:] + } + + return resultBytes, nil +} + +// validUTF8String checks whether there are illegal utf8 characters in string. +func (e *Encoding) validUTF8String(src string) (string, error) { + resultStr := src + for len(src) > 0 { + r, size := utf8.DecodeRuneInString(src) + if r == utf8.RuneError && size == 1 { + return resultStr, e.generateErr([]byte(src), 3) + } + src = src[size:] + } + + return resultStr, nil +} + // Decode convert bytes from a specific charset to utf-8 charset. func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { + if e.name == "utf-8" { + return e.validUTF8(src) + } if !e.enabled() { return src, nil } @@ -150,6 +182,9 @@ func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { // DecodeString convert a string from a specific charset to utf-8 charset. func (e *Encoding) DecodeString(src string) (string, error) { + if e.name == "utf-8" { + return e.validUTF8String(src) + } if !e.enabled() { return src, nil } From 8a8d4c64626b6bfc596019a9d1df4cbebbe8fa0b Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Fri, 12 Nov 2021 18:09:25 +0800 Subject: [PATCH 2/6] add test --- expression/builtin_string.go | 3 ++- expression/builtin_string_test.go | 27 ++++++++++++++++----------- parser/charset/encoding.go | 5 +++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/expression/builtin_string.go b/expression/builtin_string.go index 14d3eb4cadd41..2214644b676eb 100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -1153,7 +1153,8 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { if types.IsBinaryStr(b.args[0].GetType()) { target, err := enc.EncodeString(expr) if err != nil { - return "", true, err + b.ctx.GetSessionVars().StmtCtx.AppendWarning(err) + return "", true, nil } // we should convert target into utf8 internal. diff --git a/expression/builtin_string_test.go b/expression/builtin_string_test.go index 28d98d0215091..22f3ea294d129 100644 --- a/expression/builtin_string_test.go +++ b/expression/builtin_string_test.go @@ -885,14 +885,16 @@ func TestConvert(t *testing.T) { str interface{} cs string result string + isNull bool hasBinaryFlag bool }{ - {"haha", "utf8", "haha", false}, - {"haha", "ascii", "haha", false}, - {"haha", "binary", "haha", true}, - {"haha", "bInAry", "haha", true}, - {types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true}, - {types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", false}, + {"haha", "utf8", "haha", false, false}, + {"haha", "ascii", "haha", false, false}, + {"haha", "binary", "haha", false, true}, + {"haha", "bInAry", "haha", false, true}, + {types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true, true}, + {types.NewBinaryLiteralFromUint(0x1e240, -1), "utf8", "", true, false}, + {types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", true, false}, } for _, v := range tbl { fc := funcs[ast.Convert] @@ -908,8 +910,12 @@ func TestConvert(t *testing.T) { r, err := evalBuiltinFunc(f, chunk.Row{}) require.NoError(t, err) - require.Equal(t, types.KindString, r.Kind()) - require.Equal(t, v.result, r.GetString()) + if !v.isNull { + require.Equal(t, types.KindString, r.Kind()) + require.Equal(t, v.result, r.GetString()) + } else { + require.True(t, v.isNull) + } } // Test case for getFunction() error @@ -1441,9 +1447,8 @@ func TestChar(t *testing.T) { {"65", 16740, 67.5, "utf8", "AAdD", 0}, // large num {"65", -1, 67.5, nil, "A\xff\xff\xff\xffD", 0}, // nagtive int {"a", -1, 67.5, nil, "\x00\xff\xff\xff\xffD", 0}, // invalid 'a' - // TODO: Uncomment it when issue #29685 be closed - // {"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil - // {"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil + {"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil + {"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil // TODO: Uncomment it when gbk be added into charsetInfos // {"1234567", 1234567, 1234567, "gbk", "謬謬謬", 0}, // test char for gbk // {"123456789", 123456789, 123456789, "gbk", nil, 3}, // invalid 123456789 in gbk diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 6394d0a63f497..7b5cb248cbe6e 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -147,7 +147,7 @@ func (e *Encoding) validUTF8(src []byte) ([]byte, error) { for len(src) > 0 { r, size := utf8.DecodeRune(src) if r == utf8.RuneError && size == 1 { - return resultBytes, e.generateErr(src, 3) + return resultBytes, e.generateErr(src, characterLengthUTF8(src)) } src = src[size:] } @@ -161,7 +161,8 @@ func (e *Encoding) validUTF8String(src string) (string, error) { for len(src) > 0 { r, size := utf8.DecodeRuneInString(src) if r == utf8.RuneError && size == 1 { - return resultStr, e.generateErr([]byte(src), 3) + srcBytes := []byte(src) + return resultStr, e.generateErr(srcBytes, characterLengthUTF8(srcBytes)) } src = src[size:] } From 98ed2dc845da17a967246b32d518fbfa95b6f36a Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Fri, 12 Nov 2021 18:43:22 +0800 Subject: [PATCH 3/6] add intergration test --- expression/builtin_string_vec.go | 22 ++++++++++++---------- expression/integration_test.go | 12 ++++++++++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/expression/builtin_string_vec.go b/expression/builtin_string_vec.go index 11933f305fe58..59ff75ee81e7a 100644 --- a/expression/builtin_string_vec.go +++ b/expression/builtin_string_vec.go @@ -30,7 +30,6 @@ import ( "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" - "golang.org/x/text/transform" ) func (b *builtinLowerSig) vecEvalString(input *chunk.Chunk, result *chunk.Column) error { @@ -679,17 +678,14 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu } // Since charset is already validated and set from getFunction(), there's no // need to get charset from args again. - encoding, _ := charset.Lookup(b.tp.Charset) + enc := charset.NewEncoding(b.tp.Charset) // However, if `b.tp.Charset` is abnormally set to a wrong charset, we still // return with error. - if encoding == nil { + if enc.Name() == "" { return errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset) } - encoder := encoding.NewEncoder() - decoder := encoding.NewDecoder() isBinaryStr := types.IsBinaryStr(b.args[0].GetType()) isRetBinary := types.IsBinaryStr(b.tp) - enc := charset.NewEncoding(b.tp.Charset) if isRetBinary { enc = charset.NewEncoding(b.args[0].GetType().Charset) } @@ -702,13 +698,19 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu } exprI := expr.GetString(i) if isBinaryStr { - target, _, err := transform.String(encoder, exprI) + target, err := enc.EncodeString(exprI) if err != nil { - return err + result.AppendNull() + continue } // we should convert target into utf8 internal. - exprInternal, _, _ := transform.String(decoder, target) - result.AppendString(exprInternal) + exprInternal, err := enc.DecodeString(target) + if err != nil { + result.AppendNull() + continue + } else { + result.AppendString(exprInternal) + } } else { if isRetBinary { str, err := enc.EncodeString(exprI) diff --git a/expression/integration_test.go b/expression/integration_test.go index d25e3ad04195a..f0d949cf5735f 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -10580,3 +10580,15 @@ func (s *testIntegrationSuite) TestIssue29244(c *C) { tk.MustExec("set tidb_enable_vectorized_expression = off;") tk.MustQuery("select microsecond(a) from t;").Check(testkit.Rows("123500", "123500")) } + +func (s *testIntegrationSuite) TestIssue29685(c *C) { + tk := testkit.NewTestKit(c, s.store) + tk.MustExec("use test") + tk.MustExec("drop table if exists t") + tk.MustExec("create table t (a binary(5));") + tk.MustExec("insert into t values (0x1e240), ('ABCDE');") + tk.MustExec("set tidb_enable_vectorized_expression = on;") + tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("", "ABCDE")) + tk.MustExec("set tidb_enable_vectorized_expression = off;") + tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("", "ABCDE")) +} From 59b3fc4f2e5cc51a8831b045e4b6c3c2b581bff9 Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Fri, 12 Nov 2021 19:15:22 +0800 Subject: [PATCH 4/6] fix convert using gbk --- .../r/new_character_set_builtin.result | Bin 14272 -> 14820 bytes .../t/new_character_set_builtin.test | 7 +++++++ expression/builtin_string.go | 11 ++--------- expression/builtin_string_vec.go | 10 ++-------- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_builtin.result b/cmd/explaintest/r/new_character_set_builtin.result index b4944f30576116f95e88376380865ac76327de61..8772baab071112390efa5e548283e982be165863 100644 GIT binary patch delta 250 zcmX?*|D7-A~nWJIU{nq)M2xu Olix_zZT?~Wh7|yRd>!Qg diff --git a/cmd/explaintest/t/new_character_set_builtin.test b/cmd/explaintest/t/new_character_set_builtin.test index 0c6fefac7919d..3726951591461 100644 --- a/cmd/explaintest/t/new_character_set_builtin.test +++ b/cmd/explaintest/t/new_character_set_builtin.test @@ -57,6 +57,13 @@ select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk); select convert(a using binary) from t; select convert(convert('中文' using gbk) using binary), convert('中文' using binary); select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary); +drop table if exists t; +create table t(a binary(10)); +insert into t values (0xe240), (0x01e240); +set @@tidb_enable_vectorized_expression = true; +select hex(convert(a using gbk)), convert(a using gbk) from t; +set @@tidb_enable_vectorized_expression = false; +select hex(convert(a using gbk)), convert(a using gbk) from t; -- test for builtin function md5() drop table if exists t; diff --git a/expression/builtin_string.go b/expression/builtin_string.go index 2214644b676eb..86f426be2faaa 100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -1151,19 +1151,12 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) { } // if expr is binary string and convert meet error, we should return NULL. if types.IsBinaryStr(b.args[0].GetType()) { - target, err := enc.EncodeString(expr) + target, err := enc.DecodeString(expr) if err != nil { b.ctx.GetSessionVars().StmtCtx.AppendWarning(err) return "", true, nil } - - // we should convert target into utf8 internal. - exprInternal, err := enc.DecodeString(target) - if err != nil { - b.ctx.GetSessionVars().StmtCtx.AppendWarning(err) - return "", true, nil - } - return exprInternal, false, nil + return target, false, nil } if types.IsBinaryStr(b.tp) { enc := charset.NewEncoding(b.args[0].GetType().Charset) diff --git a/expression/builtin_string_vec.go b/expression/builtin_string_vec.go index 59ff75ee81e7a..d53ed5aeb41a2 100644 --- a/expression/builtin_string_vec.go +++ b/expression/builtin_string_vec.go @@ -698,18 +698,12 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu } exprI := expr.GetString(i) if isBinaryStr { - target, err := enc.EncodeString(exprI) - if err != nil { - result.AppendNull() - continue - } - // we should convert target into utf8 internal. - exprInternal, err := enc.DecodeString(target) + target, err := enc.DecodeString(exprI) if err != nil { result.AppendNull() continue } else { - result.AppendString(exprInternal) + result.AppendString(target) } } else { if isRetBinary { From 9e96ec40e7158837e1cf16d9d61777dc256aa43f Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Mon, 15 Nov 2021 13:43:41 +0800 Subject: [PATCH 5/6] fix superfluous-else --- expression/builtin_string_vec.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/expression/builtin_string_vec.go b/expression/builtin_string_vec.go index d53ed5aeb41a2..7612836bf21dc 100644 --- a/expression/builtin_string_vec.go +++ b/expression/builtin_string_vec.go @@ -702,9 +702,8 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu if err != nil { result.AppendNull() continue - } else { - result.AppendString(target) } + result.AppendString(target) } else { if isRetBinary { str, err := enc.EncodeString(exprI) From 2d64254b20104b64296a95f729ab50af09485ff5 Mon Sep 17 00:00:00 2001 From: Jason Mo Date: Thu, 18 Nov 2021 12:09:55 +0800 Subject: [PATCH 6/6] follow comments --- parser/charset/encoding.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 7b5cb248cbe6e..9f9302a6e5ac0 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -172,7 +172,7 @@ func (e *Encoding) validUTF8String(src string) (string, error) { // Decode convert bytes from a specific charset to utf-8 charset. func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { - if e.name == "utf-8" { + if e.name == encodings[CharsetUTF8].name { return e.validUTF8(src) } if !e.enabled() { @@ -183,7 +183,7 @@ func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { // DecodeString convert a string from a specific charset to utf-8 charset. func (e *Encoding) DecodeString(src string) (string, error) { - if e.name == "utf-8" { + if e.name == encodings[CharsetUTF8].name { return e.validUTF8String(src) } if !e.enabled() {