Merge pull request #11203 from ScottPJones/spj/fixvalidutf8

Fix #11141/#10973 and improve performance of is_valid_utf8/is_valid_ascii
JuliaLang · May 15, 2015 · 2f019d7 · 2f019d7
2 parents 888b6b4 + 14e289d
commit 2f019d7
Show file tree

Hide file tree

Showing 2 changed files with 143 additions and 60 deletions.
diff --git a/src/support/utf8.c b/src/support/utf8.c
@@ -615,71 +615,69 @@ size_t u8_printf(const char *fmt, ...)
     return cnt;
 }
 
-/* based on the valid_utf8 routine from the PCRE library by Philip Hazel
+/* Rewritten completely; not based on any other implementation.
+
+   Returns 0 if the bytes are not valid UTF-8, 1 if they are all ASCII
+   (an empty string counts as ASCII), and 2 if they are valid UTF-8
+   containing at least one non-ASCII character.
 
    length is in bytes, since without knowing whether the string is valid
    it's hard to know how many characters there are! */
-int u8_isvalid(const char *str, size_t length)
+int u8_isvalid(const char *str, size_t len)
 {
-    const unsigned char *p, *pend = (unsigned char*)str + length;
-    unsigned char c;
-    int ret = 1; /* ASCII */
-    int ab;
+    const unsigned char *pnt;   // Current pointer in string
+    const unsigned char *pend;  // End of string
+    unsigned char       byt;    // Current byte
+
+    // Empty strings can be considered valid ASCII
+    if (!len) return 1;
+    pnt = (unsigned char *)str;
+    pend = (unsigned char *)str + len;
+    // First scan for non-ASCII characters as fast as possible
+    do {
+        if (*pnt++ & 0x80) goto chkutf8;
+    } while (pnt < pend);
+    return 1;
 
-    for (p = (unsigned char*)str; p < pend; p++) {
-        c = *p;
-        if (c < 128)
-            continue;
-        ret = 2; /* non-ASCII UTF-8 */
-        if ((c & 0xc0) != 0xc0)
+    // Check validity of UTF-8 sequences
+chkutf8:
+    if (pnt == pend) return 0;    // Last byte can't be > 127
+    byt = pnt[-1];
+    // Must be between 0xc2 and 0xf4 inclusive to be valid
+    if (((uint32_t)byt - 0xc2) > (0xf4-0xc2)) return 0;
+    if (byt < 0xe0) {               // 2-byte sequence
+        // Must have valid continuation character
+        if ((*pnt++ & 0xc0) != 0x80) return 0;
+    } else if (byt < 0xf0) {        // 3-byte sequence
+        if ((pnt + 1 >= pend)
+              || (*pnt & 0xc0) != 0x80
+              || (pnt[1] & 0xc0) != 0x80)
             return 0;
-        ab = trailingBytesForUTF8[c];
-        if (length < ab)
-            return 0;
-        length -= ab;
-
-        p++;
-        /* Check top bits in the second byte */
-        if ((*p & 0xc0) != 0x80)
+        // Check for surrogate chars
+        if (byt == 0xed && *pnt > 0x9f) return 0;
+        // Check for overlong encoding: 0xe0 needs a second byte of 0xa0-0xbf
+        if (byt == 0xe0 && *pnt < 0xa0) return 0;
+        pnt += 2;
+    } else {                        // 4-byte sequence
+        // Must have 3 valid continuation characters
+        if ((pnt + 2 >= pend)
+              || (*pnt & 0xc0) != 0x80
+              || (pnt[1] & 0xc0) != 0x80
+              || (pnt[2] & 0xc0) != 0x80)
             return 0;
-
-        /* Check for overlong sequences for each different length */
-        switch (ab) {
-            /* Check for xx00 000x */
-        case 1:
-            if ((c & 0x3e) == 0) return 0;
-            continue;   /* We know there aren't any more bytes to check */
-
-            /* Check for 1110 0000, xx0x xxxx */
-        case 2:
-            if (c == 0xe0 && (*p & 0x20) == 0) return 0;
-            break;
-
-            /* Check for 1111 0000, xx00 xxxx */
-        case 3:
-            if (c == 0xf0 && (*p & 0x30) == 0) return 0;
-            break;
-
-            /* Check for 1111 1000, xx00 0xxx */
-        case 4:
-            if (c == 0xf8 && (*p & 0x38) == 0) return 0;
-            break;
-
-            /* Check for leading 0xfe or 0xff,
-               and then for 1111 1100, xx00 00xx */
-        case 5:
-            if (c == 0xfe || c == 0xff ||
-                (c == 0xfc && (*p & 0x3c) == 0)) return 0;
-            break;
-        }
-
-        /* Check for valid bytes after the 2nd, if any; all must start 10 */
-        while (--ab > 0) {
-            if ((*(++p) & 0xc0) != 0x80) return 0;
+        // Make sure in correct range (0x10000 - 0x10ffff)
+        if (byt == 0xf0) {
+            if (*pnt < 0x90) return 0;
+        } else if (byt == 0xf4) {
+            if (*pnt > 0x8f) return 0;
         }
+        pnt += 3;
     }
-
-    return ret;
+    // Find next non-ASCII characters as fast as possible
+    while (pnt < pend) {
+        if (*pnt++ & 0x80) goto chkutf8;
+    }
+    return 2;   // Valid UTF-8
 }
 
 int u8_reverse(char *dest, char *src, size_t len)

diff --git a/test/strings.jl b/test/strings.jl
@@ -1030,9 +1030,10 @@ end
 let
     # make symbol with invalid char
     sym = symbol(Char(0xdcdb))
-    @test string(sym) == "\udcdb"
+    @test string(sym) == string(Char(0xdcdb))
     @test expand(sym) === sym
-    @test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"")
+    res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
+    @test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))"""
 end
 
 @test symbol("asdf") === :asdf
@@ -1284,17 +1285,101 @@ end
 @test is_valid_ascii("Σ_not_valid_ascii") == false
 @test is_valid_char('a') == true
 @test is_valid_char('\x00') == true
-@test is_valid_char('\ud800') == false
+@test is_valid_char(0xd800) == false
 
 @test is_valid_utf16(utf16("a")) == true
-@test is_valid_utf16(utf16("\ud800")) == false
+@test is_valid_utf16(UInt16[0xd800,0]) == false
 # TODO is_valid_utf8
 
 # Issue #11140
 @test is_valid_utf32(utf32("a")) == true
 @test is_valid_utf32(utf32("\x00")) == true
 @test is_valid_utf32(UInt32[0xd800,0]) == false
 
+# Issue #11203
+@test is_valid_ascii(UInt8[]) == true
+@test is_valid_utf8(UInt8[]) == true
+@test is_valid_utf16(UInt16[]) == true
+@test is_valid_utf32(UInt32[]) == true
+
+# Check UTF-8 characters
+# Check ASCII range (true),
+# then single continuation bytes and lead bytes with no following continuation bytes (false)
+for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false))
+    for byt in rng
+        @test is_valid_utf8(UInt8[byt]) == flg
+    end
+end
+# Check overlong lead bytes for 2-character sequences (false)
+for byt = 0xc0:0xc1
+    @test is_valid_utf8(UInt8[byt,0x80]) == false
+end
+# Check valid lead-in to two-byte sequences (true)
+for byt = 0xc2:0xdf
+    for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
+        for cont in rng
+            @test is_valid_utf8(UInt8[byt, cont]) == flg
+        end
+    end
+end
+# Check three-byte sequences
+for r1 in (0xe1:0xec, 0xee:0xef)
+    for byt = r1
+        # Check for short sequence
+        @test is_valid_utf8(UInt8[byt]) == false
+        for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
+            for cont in rng
+                @test is_valid_utf8(UInt8[byt, cont]) == false
+                @test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
+            end
+        end
+    end
+end
+# Check lead byte 0xe0: a second byte of 0x80-0x9f would be an overlong
+# encoding of a code point below 0x800, so only 0xa0-0xbf is valid
+@test is_valid_utf8(UInt8[0xe0]) == false
+for (rng,flg) in ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false))
+    for cont in rng
+        @test is_valid_utf8(UInt8[0xe0, cont]) == false
+        @test is_valid_utf8(UInt8[0xe0, cont, 0x80]) == flg
+    end
+end
+# Check lead byte 0xed, which covers hangul characters (0xd000-0xd7ff)
+# Check for short sequence, or start of surrogate pair
+for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
+    for cont in rng
+        @test is_valid_utf8(UInt8[0xed, cont]) == false
+        @test is_valid_utf8(UInt8[0xed, cont, 0x80]) == flg
+    end
+end
+# Check valid four-byte sequences
+for byt = 0xf0:0xf4
+    if (byt == 0xf0)
+        r0 = ((0x00:0x8f, false), (0x90:0xbf, true), (0xc0:0xff, false))
+    elseif byt == 0xf4
+        r0 = ((0x00:0x7f, false), (0x80:0x8f, true), (0x90:0xff, false))
+    else
+        r0 = ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
+    end
+    for (rng,flg) in r0
+        for cont in rng
+            @test is_valid_utf8(UInt8[byt, cont]) == false
+            @test is_valid_utf8(UInt8[byt, cont, 0x80]) == false
+            @test is_valid_utf8(UInt8[byt, cont, 0x80, 0x80]) == flg
+        end
+    end
+end
+# Check five-byte sequences, should be invalid
+for byt = 0xf8:0xfb
+    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
+end
+# Check six-byte sequences, should be invalid
+for byt = 0xfc:0xfd
+    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
+end
+# Check seven-byte sequences, should be invalid
+@test is_valid_utf8(UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
+
 # This caused JuliaLang/JSON.jl#82
 @test first('\x00':'\x7f') === '\x00'
 @test last('\x00':'\x7f') === '\x7f'