Fix JuliaLang#11141 and improve performance greatly

ScottPJones · May 8, 2015 · 88a8071 · 88a8071
1 parent dfc2cea
commit 88a8071
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 21 deletions.
diff --git a/src/support/utf8.c b/src/support/utf8.c
@@ -619,7 +619,7 @@ size_t u8_printf(const char *fmt, ...)
 
    length is in bytes, since without knowing whether the string is valid
    it's hard to know how many characters there are! */
-int u8_isvalid(const char *str, size_t length)
+DLLEXPORT int u8_isvalid_old(const char *str, size_t length)
 {
     const unsigned char *p, *pend = (unsigned char*)str + length;
     unsigned char c;
@@ -646,31 +646,31 @@ int u8_isvalid(const char *str, size_t length)
         /* Check for overlong sequences for each different length */
         switch (ab) {
             /* Check for xx00 000x */
-        case 1:
-            if ((c & 0x3e) == 0) return 0;
-            continue;   /* We know there aren't any more bytes to check */
+            case 1:
+                if ((c & 0x3e) == 0) return 0;
+                continue;   /* We know there aren't any more bytes to check */
 
             /* Check for 1110 0000, xx0x xxxx */
-        case 2:
-            if (c == 0xe0 && (*p & 0x20) == 0) return 0;
-            break;
+            case 2:
+                if (c == 0xe0 && (*p & 0x20) == 0) return 0;
+                break;
 
             /* Check for 1111 0000, xx00 xxxx */
-        case 3:
-            if (c == 0xf0 && (*p & 0x30) == 0) return 0;
-            break;
+            case 3:
+                if (c == 0xf0 && (*p & 0x30) == 0) return 0;
+                break;
 
             /* Check for 1111 1000, xx00 0xxx */
-        case 4:
-            if (c == 0xf8 && (*p & 0x38) == 0) return 0;
-            break;
+            case 4:
+                if (c == 0xf8 && (*p & 0x38) == 0) return 0;
+                break;
 
             /* Check for leading 0xfe or 0xff,
                and then for 1111 1100, xx00 00xx */
-        case 5:
-            if (c == 0xfe || c == 0xff ||
-                (c == 0xfc && (*p & 0x3c) == 0)) return 0;
-            break;
+            case 5:
+                if (c == 0xfe || c == 0xff ||
+                      (c == 0xfc && (*p & 0x3c) == 0)) return 0;
+                break;
         }
 
         /* Check for valid bytes after the 2nd, if any; all must start 10 */
@@ -682,6 +682,65 @@ int u8_isvalid(const char *str, size_t length)
     return ret;
 }
 
+/* Classify a byte buffer as invalid, pure ASCII, or well-formed UTF-8.
+   iStr points to the bytes to check; iLength is their count in bytes,
+   since without knowing whether the string is valid it's hard to know
+   how many characters there are.  Returns 0 if invalid, 1 if every byte
+   is ASCII (or the input is empty), 2 if valid UTF-8 with non-ASCII. */
+int u8_isvalid(const char *iStr, size_t iLength)
+{
+    const unsigned char *pnt = (const unsigned char *)iStr; // Current pointer
+    const unsigned char *pend;                              // End of string
+    unsigned char       byt;                                // Lead byte
+
+    if (!iLength) return 1;       // Empty strings count as valid ASCII
+    pend = pnt + iLength;
+    // First scan for non-ASCII characters as fast as possible
+    do {
+        if (*pnt++ & 0x80) goto chkutf8;
+    } while (pnt < pend);
+    return 1;
+
+    // Check validity of the sequence whose lead byte was just consumed
+chkutf8:
+    if (pnt == pend) return 0;    // Last byte can't be > 127
+    byt = pnt[-1];
+    // Must be in 0xc2..0xf4; unsigned wrap-around also rejects < 0xc2
+    if (((unsigned)byt - 0xc2) > (0xf4-0xc2)) return 0;
+    if (byt < 0xe0) {               // 2-byte sequence
+        // Must have valid continuation character
+        if ((*pnt++ & 0xc0) != 0x80) return 0;
+    } else if (byt < 0xf0) {        // 3-byte sequence
+        if ((pnt + 1 >= pend)
+              || (*pnt & 0xc0) != 0x80
+              || (pnt[1] & 0xc0) != 0x80)
+            return 0;
+        // Reject overlong forms (below U+0800) and surrogates (U+D800-U+DFFF)
+        if (byt == 0xe0 && *pnt < 0xa0) return 0;
+        if (byt == 0xed && *pnt > 0x9f) return 0;
+        pnt += 2;
+    } else {                        // 4-byte sequence
+        // Must have 3 valid continuation characters
+        if ((pnt + 2 >= pend)
+              || (*pnt & 0xc0) != 0x80
+              || (pnt[1] & 0xc0) != 0x80
+              || (pnt[2] & 0xc0) != 0x80)
+            return 0;
+        // Make sure in correct range (0x10000 - 0x10ffff)
+        if (byt == 0xf0) {
+            if (*pnt < 0x90) return 0;
+        } else if (byt == 0xf4) {
+            if (*pnt > 0x8f) return 0;
+        }
+        pnt += 3;
+    }
+    // Find next non-ASCII characters as fast as possible
+    while (pnt < pend) {
+        if (*pnt++ & 0x80) goto chkutf8;
+    }
+    return 2;   // Valid UTF-8
+}
+
 int u8_reverse(char *dest, char *src, size_t len)
 {
     size_t si=0, di=len;

diff --git a/test/strings.jl b/test/strings.jl
@@ -1030,9 +1030,10 @@ end
 let
     # make symbol with invalid char
     sym = symbol(Char(0xdcdb))
-    @test string(sym) == "\udcdb"
+    @test string(sym) == string(Char(0xdcdb))
     @test expand(sym) === sym
-    @test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"")
+    res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
+    @test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))"""
 end
 
 @test symbol("asdf") === :asdf
@@ -1284,10 +1285,10 @@ end
 @test is_valid_ascii("Σ_not_valid_ascii") == false
 @test is_valid_char('a') == true
 @test is_valid_char('\x00') == true
-@test is_valid_char('\ud800') == false
+@test is_valid_char(0xd800) == false
 
 @test is_valid_utf16(utf16("a")) == true
-@test is_valid_utf16(utf16("\ud800")) == false
+@test is_valid_utf16(UInt16[0xd800,0]) == false
 # TODO is_valid_utf8
 
 # Issue #11140