From 92f8a9692d1851427fdaa2ca63512bd9de871d70 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Sat, 3 Nov 2018 12:46:47 -0700 Subject: [PATCH] Fix isvalid for 3-byte overlong encoded UTF-8 sequences (#29908) --- src/support/utf8.c | 2 ++ test/strings/basic.jl | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/support/utf8.c b/src/support/utf8.c index 28c779b73b58ba..ea7e970be6b512 100644 --- a/src/support/utf8.c +++ b/src/support/utf8.c @@ -570,6 +570,8 @@ int u8_isvalid(const char *str, size_t len) return 0; // Check for surrogate chars if (byt == 0xed && *pnt > 0x9f) return 0; + // Check for overlong encoding + if (byt == 0xe0 && *pnt < 0xa0) return 0; pnt += 2; } else { // 4-byte sequence // Must have 3 valid continuation characters diff --git a/test/strings/basic.jl b/test/strings/basic.jl index fe3157e83fd740..337e7e3231a4f0 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -467,9 +467,17 @@ end end end end + # Check for short three-byte sequences + @test isvalid(String, UInt8[0xe0]) == false + for (rng, flg) in ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false)) + for cont in rng + @test isvalid(String, UInt8[0xe0, cont]) == false + @test isvalid(String, UInt8[0xe0, cont, 0x80]) == flg + end + end # Check three-byte sequences - for r1 in (0xe0:0xec, 0xee:0xef) - for byt = r1 + for r1 in (0xe1:0xec, 0xee:0xef) + for byt in r1 # Check for short sequence @test isvalid(String, UInt8[byt]) == false for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))