/*
 * Check whether a byte buffer is well-formed UTF-8.
 *
 * iStr    - bytes to examine (not required to be NUL-terminated; embedded
 *           NUL bytes are treated as ordinary ASCII).
 * iLength - length in BYTES, since without knowing whether the string is
 *           valid it's hard to know how many characters there are.
 *
 * Returns:
 *   0 - the buffer contains an ill-formed sequence,
 *   1 - every byte is ASCII (this includes the empty buffer),
 *   2 - the buffer is valid UTF-8 and contains at least one multi-byte
 *       sequence.
 *
 * Rejected as ill-formed: stray continuation bytes, lead bytes outside
 * 0xc2..0xf4, truncated sequences, overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF.
 */
int u8_isvalid(const char *iStr, size_t iLength)
{
    const unsigned char *pnt;   /* next byte to examine */
    const unsigned char *pend;  /* one past the last byte */
    int result = 1;             /* stays 1 while only ASCII has been seen */

    /* Empty strings can be considered valid ASCII. Returning before any
       pointer arithmetic also keeps a null iStr with zero length safe. */
    if (iLength == 0)
        return 1;

    pnt = (const unsigned char *)iStr;
    pend = pnt + iLength;

    while (pnt < pend) {
        unsigned char byt = *pnt++;
        size_t avail;

        if (byt < 0x80)
            continue;           /* plain ASCII */

        /* 0x80..0xc1 are continuation bytes or overlong 2-byte leads;
           0xf5..0xff would encode values above U+10FFFF. */
        if (byt < 0xc2 || byt > 0xf4)
            return 0;

        /* Count the bytes left instead of comparing pnt + k with pend, so
           no pointer beyond one-past-the-end is ever formed. */
        avail = (size_t)(pend - pnt);

        if (byt < 0xe0) {                       /* 2-byte sequence */
            if (avail < 1 || (pnt[0] & 0xc0) != 0x80)
                return 0;
            pnt += 1;
        }
        else if (byt < 0xf0) {                  /* 3-byte sequence */
            if (avail < 2
                || (pnt[0] & 0xc0) != 0x80
                || (pnt[1] & 0xc0) != 0x80)
                return 0;
            /* Overlong: 0xe0 must be followed by 0xa0..0xbf */
            if (byt == 0xe0 && pnt[0] < 0xa0)
                return 0;
            /* Surrogates: 0xed must be followed by 0x80..0x9f */
            if (byt == 0xed && pnt[0] > 0x9f)
                return 0;
            pnt += 2;
        }
        else {                                  /* 4-byte sequence */
            if (avail < 3
                || (pnt[0] & 0xc0) != 0x80
                || (pnt[1] & 0xc0) != 0x80
                || (pnt[2] & 0xc0) != 0x80)
                return 0;
            /* Keep the code point within 0x10000..0x10ffff */
            if (byt == 0xf0 && pnt[0] < 0x90)
                return 0;
            if (byt == 0xf4 && pnt[0] > 0x8f)
                return 0;
            pnt += 3;
        }

        result = 2;             /* saw at least one valid multi-byte sequence */
    }

    return result;
}
expand(sym) === sym - @test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"") + res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1]) + @test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))""" end @test symbol("asdf") === :asdf @@ -1284,10 +1285,10 @@ end @test is_valid_ascii("Σ_not_valid_ascii") == false @test is_valid_char('a') == true @test is_valid_char('\x00') == true -@test is_valid_char('\ud800') == false +@test is_valid_char(0xd800) == false @test is_valid_utf16(utf16("a")) == true -@test is_valid_utf16(utf16("\ud800")) == false +@test is_valid_utf16(UInt16[0xd800,0]) == false # TODO is_valid_utf8 # Issue #11140