Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #11141/#10973 and improve performance of is_valid_utf8/is_valid_ascii #11203

Merged
merged 3 commits into from
May 15, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 48 additions & 56 deletions src/support/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -615,71 +615,63 @@ size_t u8_printf(const char *fmt, ...)
return cnt;
}

/* Validate a byte string as ASCII / UTF-8.

   Rewritten completely, original code not based on anything else.

   len is in bytes, since without knowing whether the string is valid
   it's hard to know how many characters there are!

   Returns:
     0 - not valid UTF-8
     1 - valid and pure ASCII (an empty string counts as ASCII)
     2 - valid UTF-8 containing at least one non-ASCII sequence

   Accepted multi-byte forms (lead byte, then continuation bytes):
     C2..DF  80..BF
     E0      A0..BF  80..BF      (80..9F would be an overlong encoding)
     E1..EC  80..BF  80..BF
     ED      80..9F  80..BF      (A0..BF would encode a surrogate)
     EE..EF  80..BF  80..BF
     F0      90..BF  80..BF  80..BF   (80..8F would be overlong)
     F1..F3  80..BF  80..BF  80..BF
     F4      80..8F  80..BF  80..BF   (90..BF would exceed 0x10ffff) */
int u8_isvalid(const char *str, size_t len)
{
    const unsigned char *pnt;   // Current pointer in string
    const unsigned char *pend;  // End of string (one past the last byte)
    unsigned char byt;          // Current lead byte
    int ret = 1;                // Stays 1 while only ASCII has been seen

    // Empty strings can be considered valid ASCII
    if (len == 0) return 1;
    pnt = (const unsigned char *)str;
    pend = pnt + len;

    while (pnt < pend) {
        byt = *pnt++;
        // ASCII bytes need no further checking
        if ((byt & 0x80) == 0) continue;
        ret = 2;

        // Lead byte must be between 0xc2 and 0xf4 inclusive; this rejects
        // stray continuation bytes, the overlong leads 0xc0/0xc1 and
        // anything that would start a sequence above 0x10ffff
        if (byt < 0xc2 || byt > 0xf4) return 0;

        // Remaining byte counts are compared as (pend - pnt) so that no
        // pointer beyond one-past-the-end is ever formed
        if (byt < 0xe0) { // 2-byte sequence
            // Must have a valid continuation character
            if (pend - pnt < 1 || (pnt[0] & 0xc0) != 0x80) return 0;
            pnt += 1;
        } else if (byt < 0xf0) { // 3-byte sequence
            // Must have 2 valid continuation characters
            if (pend - pnt < 2
                || (pnt[0] & 0xc0) != 0x80
                || (pnt[1] & 0xc0) != 0x80)
                return 0;
            // Check for overlong encoding (code point below 0x800)
            if (byt == 0xe0 && pnt[0] < 0xa0) return 0;
            // Check for surrogate chars (0xd800 - 0xdfff)
            if (byt == 0xed && pnt[0] > 0x9f) return 0;
            pnt += 2;
        } else { // 4-byte sequence
            // Must have 3 valid continuation characters
            if (pend - pnt < 3
                || (pnt[0] & 0xc0) != 0x80
                || (pnt[1] & 0xc0) != 0x80
                || (pnt[2] & 0xc0) != 0x80)
                return 0;
            // Make sure in correct range (0x10000 - 0x10ffff)
            if (byt == 0xf0 && pnt[0] < 0x90) return 0;
            if (byt == 0xf4 && pnt[0] > 0x8f) return 0;
            pnt += 3;
        }
    }
    return ret;
}

int u8_reverse(char *dest, char *src, size_t len)
Expand Down
84 changes: 80 additions & 4 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1030,9 +1030,10 @@ end
# A symbol built from a character the parser reports as invalid: check how it
# converts back to a string, how expand() treats it, and what parse() returns.
let
# make symbol with invalid char
# (0xdcdb lies inside the UTF-16 surrogate range 0xd800-0xdfff)
sym = symbol(Char(0xdcdb))
@test string(sym) == "\udcdb"
@test string(sym) == string(Char(0xdcdb))
# expand() is expected to hand back the very same symbol
@test expand(sym) === sym
# With raise=false the first element of the parse result is compared against
# an :error expression instead of expecting an exception
@test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"")
# Same check, but on the printed form of the returned expression and with the
# input assembled from Char(0xdcdb) rather than a "\udcdb" string literal
res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
@test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))"""
end

@test symbol("asdf") === :asdf
Expand Down Expand Up @@ -1284,17 +1285,92 @@ end
# Basic checks of the validity predicates for ASCII, single characters,
# UTF-16 and UTF-32 data
@test is_valid_ascii("Σ_not_valid_ascii") == false
@test is_valid_char('a') == true
@test is_valid_char('\x00') == true
# 0xd800 is the first surrogate code unit; checked both as a character
# literal and as a plain integer
@test is_valid_char('\ud800') == false
@test is_valid_char(0xd800) == false

@test is_valid_utf16(utf16("a")) == true
# A lone leading surrogate is expected to be rejected, whether it comes from a
# string literal or from a raw UInt16 array (the array form ends with a 0 unit)
@test is_valid_utf16(utf16("\ud800")) == false
@test is_valid_utf16(UInt16[0xd800,0]) == false
# TODO is_valid_utf8

# Issue #11140
@test is_valid_utf32(utf32("a")) == true
@test is_valid_utf32(utf32("\x00")) == true
# A surrogate value is expected to be rejected as UTF-32 as well
@test is_valid_utf32(UInt32[0xd800,0]) == false

# Issue #11203
# Empty input is expected to be valid for every encoding
@test is_valid_ascii(UInt8[]) == true
@test is_valid_utf8(UInt8[]) == true
@test is_valid_utf16(UInt16[]) == true
@test is_valid_utf32(UInt32[]) == true

# Check UTF-8 characters
# Each group below feeds raw byte arrays to is_valid_utf8 and compares the
# result with the expected flag for that range of bytes.

# Check ASCII range (true),
# then single continuation bytes and lead bytes with no following continuation bytes (false)
for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false))
    for byt in rng
        @test is_valid_utf8(UInt8[byt]) == flg
    end
end
# Check overlong lead bytes for 2-character sequences (false)
for byt = 0xc0:0xc1
    @test is_valid_utf8(UInt8[byt,0x80]) == false
end
# Check valid lead-in to two-byte sequences (true)
for byt = 0xc2:0xdf
    for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
        for cont in rng
            @test is_valid_utf8(UInt8[byt, cont]) == flg
        end
    end
end
# Check three-byte sequences whose second byte may be any continuation byte
# (lead bytes 0xe0 and 0xed have restricted second bytes and are handled below)
for r1 in (0xe1:0xec, 0xee:0xef)
    for byt = r1
        # Check for short sequence
        @test is_valid_utf8(UInt8[byt]) == false
        for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
            for cont in rng
                @test is_valid_utf8(UInt8[byt, cont]) == false
                @test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
            end
        end
    end
end
# Check lead byte 0xe0.
# Second bytes 0x80:0x9f would encode code points below 0x800 in three bytes
# (overlong forms, ill-formed per RFC 3629), so they are not asserted to be
# valid here; only the short-sequence check applies to them.
@test is_valid_utf8(UInt8[0xe0]) == false
for cont = 0x00:0xff
    @test is_valid_utf8(UInt8[0xe0, cont]) == false
end
for (rng,flg) in ((0x00:0x7f, false), (0xa0:0xbf, true), (0xc0:0xff, false))
    for cont in rng
        @test is_valid_utf8(UInt8[0xe0, cont, 0x80]) == flg
    end
end
# Check lead byte 0xed (code points 0xd000-0xdfff):
# second bytes 0x80:0x9f give 0xd000-0xd7ff (valid, includes Hangul syllables),
# second bytes 0xa0:0xbf would give the surrogates 0xd800-0xdfff (invalid)
# Check for short sequence, or start of surrogate pair
for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
    for cont in rng
        @test is_valid_utf8(UInt8[0xed, cont]) == false
        @test is_valid_utf8(UInt8[0xed, cont, 0x80]) == flg
    end
end
# Check valid four-byte sequences
for byt = 0xf0:0xf4
    if byt == 0xf0
        # 0x80:0x8f would be an overlong encoding of a code point below 0x10000
        r0 = ((0x00:0x8f, false), (0x90:0xbf, true), (0xc0:0xff, false))
    elseif byt == 0xf4
        # 0x90:0xbf would encode a code point above 0x10ffff
        r0 = ((0x00:0x7f, false), (0x80:0x8f, true), (0x90:0xff, false))
    else
        r0 = ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
    end
    for (rng,flg) in r0
        for cont in rng
            @test is_valid_utf8(UInt8[byt, cont]) == false
            @test is_valid_utf8(UInt8[byt, cont, 0x80]) == false
            @test is_valid_utf8(UInt8[byt, cont, 0x80, 0x80]) == flg
        end
    end
end
# Check five-byte sequences, should be invalid
for byt = 0xf8:0xfb
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check six-byte sequences, should be invalid
for byt = 0xfc:0xfd
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check lead bytes 0xfe and 0xff, which never appear in UTF-8: both the
# truncated form (five continuation bytes) and a full seven-byte form
# (six continuation bytes) should be invalid
for byt = 0xfe:0xff
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
end

# This caused JuliaLang/JSON.jl#82
@test first('\x00':'\x7f') === '\x00'
@test last('\x00':'\x7f') === '\x7f'
Expand Down