Skip to content

Commit

Permalink
Fix JuliaLang#11141 and improve performance greatly
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed May 8, 2015
1 parent dfc2cea commit 88a8071
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 21 deletions.
93 changes: 76 additions & 17 deletions src/support/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ size_t u8_printf(const char *fmt, ...)
length is in bytes, since without knowing whether the string is valid
it's hard to know how many characters there are! */
int u8_isvalid(const char *str, size_t length)
DLLEXPORT int u8_isvalid_old(const char *str, size_t length)
{
const unsigned char *p, *pend = (unsigned char*)str + length;
unsigned char c;
Expand All @@ -646,31 +646,31 @@ int u8_isvalid(const char *str, size_t length)
/* Check for overlong sequences for each different length */
switch (ab) {
/* Check for xx00 000x */
case 1:
if ((c & 0x3e) == 0) return 0;
continue; /* We know there aren't any more bytes to check */
case 1:
if ((c & 0x3e) == 0) return 0;
continue; /* We know there aren't any more bytes to check */

/* Check for 1110 0000, xx0x xxxx */
case 2:
if (c == 0xe0 && (*p & 0x20) == 0) return 0;
break;
case 2:
if (c == 0xe0 && (*p & 0x20) == 0) return 0;
break;

/* Check for 1111 0000, xx00 xxxx */
case 3:
if (c == 0xf0 && (*p & 0x30) == 0) return 0;
break;
case 3:
if (c == 0xf0 && (*p & 0x30) == 0) return 0;
break;

/* Check for 1111 1000, xx00 0xxx */
case 4:
if (c == 0xf8 && (*p & 0x38) == 0) return 0;
break;
case 4:
if (c == 0xf8 && (*p & 0x38) == 0) return 0;
break;

/* Check for leading 0xfe or 0xff,
and then for 1111 1100, xx00 00xx */
case 5:
if (c == 0xfe || c == 0xff ||
(c == 0xfc && (*p & 0x3c) == 0)) return 0;
break;
case 5:
if (c == 0xfe || c == 0xff ||
(c == 0xfc && (*p & 0x3c) == 0)) return 0;
break;
}

/* Check for valid bytes after the 2nd, if any; all must start 10 */
Expand All @@ -682,6 +682,65 @@ int u8_isvalid(const char *str, size_t length)
return ret;
}

/* Check whether a byte buffer is pure ASCII or well-formed UTF-8.
   Rewritten completely, original code not based on anything else.

   iStr    - bytes to check (no NUL terminator is required or looked for)
   iLength - length in bytes, since without knowing whether the string is
             valid it's hard to know how many characters there are!

   Returns:
     0 - not valid UTF-8
     1 - only 7-bit ASCII bytes (the empty buffer counts as ASCII)
     2 - valid UTF-8 containing at least one multi-byte sequence

   Rejected as invalid: stray continuation bytes, lead bytes outside
   0xc2..0xf4, truncated sequences, overlong encodings, UTF-16 surrogates
   (U+D800..U+DFFF) and code points above U+10FFFF. */
int u8_isvalid(const char *iStr, size_t iLength)
{
    const unsigned char *pnt;   // Current pointer in string
    const unsigned char *pend;  // End of string
    unsigned char byt;          // Lead byte of the sequence being checked

    // Empty strings can be considered valid ASCII
    if (!iLength) return 1;
    pnt = (const unsigned char *)iStr;
    pend = pnt + iLength;
    // First scan for non-ASCII characters as fast as possible
    do {
        if (*pnt++ & 0x80) goto chkutf8;
    } while (pnt < pend);
    return 1;

    // Check validity of UTF-8 sequences; pnt points just past the lead byte
chkutf8:
    if (pnt == pend) return 0;    // Last byte can't be > 127
    byt = pnt[-1];
    // Must be between 0xc2 and 0xf4 inclusive to be valid; the unsigned
    // subtraction wraps for smaller values so one compare covers both ends.
    // (0xc0/0xc1 would only produce overlong 2-byte forms.)
    if (((unsigned)byt - 0xc2) > (0xf4 - 0xc2)) return 0;
    if (byt < 0xe0) {             // 2-byte sequence
        // Must have valid continuation character
        if ((*pnt++ & 0xc0) != 0x80) return 0;
    } else if (byt < 0xf0) {      // 3-byte sequence
        // Must have 2 valid continuation characters
        if ((pend - pnt) < 2
            || (*pnt & 0xc0) != 0x80
            || (pnt[1] & 0xc0) != 0x80)
            return 0;
        // Check for surrogate chars (U+D800 - U+DFFF)
        if (byt == 0xed && *pnt > 0x9f) return 0;
        // Check for overlong encoding (anything below U+0800)
        if (byt == 0xe0 && *pnt < 0xa0) return 0;
        pnt += 2;
    } else {                      // 4-byte sequence
        // Must have 3 valid continuation characters
        if ((pend - pnt) < 3
            || (*pnt & 0xc0) != 0x80
            || (pnt[1] & 0xc0) != 0x80
            || (pnt[2] & 0xc0) != 0x80)
            return 0;
        // Make sure in correct range (0x10000 - 0x10ffff)
        if (byt == 0xf0) {
            if (*pnt < 0x90) return 0;   // overlong, below U+10000
        } else if (byt == 0xf4) {
            if (*pnt > 0x8f) return 0;   // above U+10FFFF
        }
        pnt += 3;
    }
    // Find next non-ASCII characters as fast as possible
    while (pnt < pend) {
        if (*pnt++ & 0x80) goto chkutf8;
    }
    return 2;   // Valid UTF-8
}

int u8_reverse(char *dest, char *src, size_t len)
{
size_t si=0, di=len;
Expand Down
9 changes: 5 additions & 4 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1030,9 +1030,10 @@ end
let
# make symbol with invalid char
sym = symbol(Char(0xdcdb))
@test string(sym) == "\udcdb"
@test string(sym) == string(Char(0xdcdb))
@test expand(sym) === sym
@test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"")
res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
@test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))"""
end

@test symbol("asdf") === :asdf
Expand Down Expand Up @@ -1284,10 +1285,10 @@ end
@test is_valid_ascii("Σ_not_valid_ascii") == false
@test is_valid_char('a') == true
@test is_valid_char('\x00') == true
@test is_valid_char('\ud800') == false
@test is_valid_char(0xd800) == false

@test is_valid_utf16(utf16("a")) == true
@test is_valid_utf16(utf16("\ud800")) == false
@test is_valid_utf16(UInt16[0xd800,0]) == false
# TODO is_valid_utf8

# Issue #11140
Expand Down

0 comments on commit 88a8071

Please sign in to comment.