Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expression: add check for utf8 charset when decode to avoid invalid character #29765

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified cmd/explaintest/r/new_character_set_builtin.result
Binary file not shown.
7 changes: 7 additions & 0 deletions cmd/explaintest/t/new_character_set_builtin.test
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk);
select convert(a using binary) from t;
select convert(convert('中文' using gbk) using binary), convert('中文' using binary);
select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary);
drop table if exists t;
create table t(a binary(10));
insert into t values (0xe240), (0x01e240);
set @@tidb_enable_vectorized_expression = true;
select hex(convert(a using gbk)), convert(a using gbk) from t;
set @@tidb_enable_vectorized_expression = false;
select hex(convert(a using gbk)), convert(a using gbk) from t;

-- test for builtin function md5()
drop table if exists t;
Expand Down
16 changes: 6 additions & 10 deletions expression/builtin_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ import (
"github.com/pingcap/tidb/util/logutil"
"github.com/pingcap/tipb/go-tipb"
"go.uber.org/zap"
"golang.org/x/text/transform"
)

var (
Expand Down Expand Up @@ -1144,29 +1143,26 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) {

// Since charset is already validated and set from getFunction(), there's no
// need to get charset from args again.
encoding, _ := charset.Lookup(b.tp.Charset)
enc := charset.NewEncoding(b.tp.Charset)
// However, if `b.tp.Charset` is abnormally set to a wrong charset, we still
// return with error.
if encoding == nil {
if enc.Name() == "" {
return "", true, errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset)
}
// if expr is binary string and convert meet error, we should return NULL.
if types.IsBinaryStr(b.args[0].GetType()) {
target, _, err := transform.String(encoding.NewEncoder(), expr)
target, err := enc.DecodeString(expr)
if err != nil {
return "", true, err
b.ctx.GetSessionVars().StmtCtx.AppendWarning(err)
return "", true, nil
}

// we should convert target into utf8 internal.
exprInternal, _, _ := transform.String(encoding.NewDecoder(), target)
return exprInternal, false, nil
return target, false, nil
}
if types.IsBinaryStr(b.tp) {
enc := charset.NewEncoding(b.args[0].GetType().Charset)
expr, err = enc.EncodeString(expr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it ok to return err directly?

Copy link
Contributor Author

@Defined2014 Defined2014 Nov 18, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine. But not sure. PTAL @xiongjiwei

return expr, false, err
}
enc := charset.NewEncoding(b.tp.Charset)
return string(enc.EncodeInternal(nil, []byte(expr))), false, nil
}

Expand Down
27 changes: 16 additions & 11 deletions expression/builtin_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -885,14 +885,16 @@ func TestConvert(t *testing.T) {
str interface{}
cs string
result string
isNull bool
hasBinaryFlag bool
}{
{"haha", "utf8", "haha", false},
{"haha", "ascii", "haha", false},
{"haha", "binary", "haha", true},
{"haha", "bInAry", "haha", true},
{types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true},
{types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", false},
{"haha", "utf8", "haha", false, false},
{"haha", "ascii", "haha", false, false},
{"haha", "binary", "haha", false, true},
{"haha", "bInAry", "haha", false, true},
{types.NewBinaryLiteralFromUint(0x7e, -1), "BiNarY", "~", true, true},
{types.NewBinaryLiteralFromUint(0x1e240, -1), "utf8", "", true, false},
{types.NewBinaryLiteralFromUint(0xe4b8ade696870a, -1), "uTf8", "中文\n", true, false},
}
for _, v := range tbl {
fc := funcs[ast.Convert]
Expand All @@ -908,8 +910,12 @@ func TestConvert(t *testing.T) {

r, err := evalBuiltinFunc(f, chunk.Row{})
require.NoError(t, err)
require.Equal(t, types.KindString, r.Kind())
require.Equal(t, v.result, r.GetString())
if !v.isNull {
require.Equal(t, types.KindString, r.Kind())
require.Equal(t, v.result, r.GetString())
} else {
require.True(t, v.isNull)
}
}

// Test case for getFunction() error
Expand Down Expand Up @@ -1441,9 +1447,8 @@ func TestChar(t *testing.T) {
{"65", 16740, 67.5, "utf8", "AAdD", 0}, // large num
{"65", -1, 67.5, nil, "A\xff\xff\xff\xffD", 0}, // nagtive int
{"a", -1, 67.5, nil, "\x00\xff\xff\xff\xffD", 0}, // invalid 'a'
// TODO: Uncomment it when issue #29685 be closed
// {"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil
// {"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil
{"65", -1, 67.5, "utf8", nil, 1}, // with utf8, return nil
{"a", -1, 67.5, "utf8", nil, 2}, // with utf8, return nil
// TODO: Uncomment it when gbk be added into charsetInfos
// {"1234567", 1234567, 1234567, "gbk", "謬謬謬", 0}, // test char for gbk
// {"123456789", 123456789, 123456789, "gbk", nil, 3}, // invalid 123456789 in gbk
Expand Down
17 changes: 6 additions & 11 deletions expression/builtin_string_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/collate"
"golang.org/x/text/transform"
)

func (b *builtinLowerSig) vecEvalString(input *chunk.Chunk, result *chunk.Column) error {
Expand Down Expand Up @@ -679,17 +678,14 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu
}
// Since charset is already validated and set from getFunction(), there's no
// need to get charset from args again.
encoding, _ := charset.Lookup(b.tp.Charset)
enc := charset.NewEncoding(b.tp.Charset)
// However, if `b.tp.Charset` is abnormally set to a wrong charset, we still
// return with error.
if encoding == nil {
if enc.Name() == "" {
return errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset)
}
encoder := encoding.NewEncoder()
decoder := encoding.NewDecoder()
isBinaryStr := types.IsBinaryStr(b.args[0].GetType())
isRetBinary := types.IsBinaryStr(b.tp)
enc := charset.NewEncoding(b.tp.Charset)
if isRetBinary {
enc = charset.NewEncoding(b.args[0].GetType().Charset)
}
Expand All @@ -702,13 +698,12 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu
}
exprI := expr.GetString(i)
if isBinaryStr {
target, _, err := transform.String(encoder, exprI)
target, err := enc.DecodeString(exprI)
if err != nil {
return err
result.AppendNull()
continue
}
// we should convert target into utf8 internal.
exprInternal, _, _ := transform.String(decoder, target)
result.AppendString(exprInternal)
result.AppendString(target)
} else {
if isRetBinary {
str, err := enc.EncodeString(exprI)
Expand Down
12 changes: 12 additions & 0 deletions expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10580,3 +10580,15 @@ func (s *testIntegrationSuite) TestIssue29244(c *C) {
tk.MustExec("set tidb_enable_vectorized_expression = off;")
tk.MustQuery("select microsecond(a) from t;").Check(testkit.Rows("123500", "123500"))
}

func (s *testIntegrationSuite) TestIssue29685(c *C) {
tk := testkit.NewTestKit(c, s.store)
tk.MustExec("use test")
tk.MustExec("drop table if exists t")
tk.MustExec("create table t (a binary(5));")
tk.MustExec("insert into t values (0x1e240), ('ABCDE');")
tk.MustExec("set tidb_enable_vectorized_expression = on;")
tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("<nil>", "ABCDE"))
tk.MustExec("set tidb_enable_vectorized_expression = off;")
tk.MustQuery("select convert(t.a using utf8) from t;").Check(testkit.Rows("<nil>", "ABCDE"))
}
36 changes: 36 additions & 0 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"reflect"
"strings"
"unicode"
"unicode/utf8"
"unsafe"

"github.com/cznic/mathutil"
Expand Down Expand Up @@ -140,8 +141,40 @@ func (e *Encoding) EncodeInternal(dest, src []byte) []byte {
return dest
}

// validUTF8 checks whether there are illegal utf8 characters in []byte.
func (e *Encoding) validUTF8(src []byte) ([]byte, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is utf8.Valid(), we can avoid write it by ourself

Copy link
Contributor Author

@Defined2014 Defined2014 Nov 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implement this func by ourself is for compatible err msg.

resultBytes := src
for len(src) > 0 {
r, size := utf8.DecodeRune(src)
if r == utf8.RuneError && size == 1 {
return resultBytes, e.generateErr(src, characterLengthUTF8(src))
}
src = src[size:]
}

return resultBytes, nil
}

// validUTF8String checks whether there are illegal utf8 characters in string.
func (e *Encoding) validUTF8String(src string) (string, error) {
resultStr := src
for len(src) > 0 {
r, size := utf8.DecodeRuneInString(src)
if r == utf8.RuneError && size == 1 {
srcBytes := []byte(src)
return resultStr, e.generateErr(srcBytes, characterLengthUTF8(srcBytes))
}
src = src[size:]
}

return resultStr, nil
}

// Decode convert bytes from a specific charset to utf-8 charset.
func (e *Encoding) Decode(dest, src []byte) ([]byte, error) {
if e.name == encodings[CharsetUTF8].name {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tangenta PTAL, Do we need to check the utf8 here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason is #29685

return e.validUTF8(src)
}
if !e.enabled() {
return src, nil
}
Expand All @@ -150,6 +183,9 @@ func (e *Encoding) Decode(dest, src []byte) ([]byte, error) {

// DecodeString convert a string from a specific charset to utf-8 charset.
func (e *Encoding) DecodeString(src string) (string, error) {
if e.name == encodings[CharsetUTF8].name {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WIll an extra check cause a performance regression?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DecodeString will called by the lexer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it will. Maybe #30288 is a better way to do this. I will disscuss with @tangenta later.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tangenta @Defined2014 Do you still want to go on with this issue, or leave it with #30288?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I will close it.

return e.validUTF8String(src)
}
if !e.enabled() {
return src, nil
}
Expand Down