Skip to content

Commit

Permalink
Merge pull request #11203 from ScottPJones/spj/fixvalidutf8
Browse files Browse the repository at this point in the history
Fix #11141/#10973 and improve performance of is_valid_utf8/is_valid_ascii
  • Loading branch information
stevengj committed May 15, 2015
2 parents 888b6b4 + 14e289d commit 2f019d7
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 60 deletions.
104 changes: 48 additions & 56 deletions src/support/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -615,71 +615,63 @@ size_t u8_printf(const char *fmt, ...)
return cnt;
}

/* Check whether a byte string is well-formed UTF-8.

   Rewritten completely, original code not based on anything else.
   The length is in bytes, since without knowing whether the string is valid
   it's hard to know how many characters there are!

   str: bytes to check; need not be NUL-terminated and is not read when len is 0
   len: number of bytes in str

   Returns 0 if the bytes are not valid UTF-8,
           1 if every byte is ASCII (this includes the empty string),
           2 if the bytes are valid UTF-8 with at least one multi-byte sequence.

   Rejected as invalid:
     - stray continuation bytes and truncated sequences
     - overlong encodings (leads 0xc0/0xc1, 0xe0 followed by a byte below 0xa0,
       0xf0 followed by a byte below 0x90)
     - UTF-16 surrogates U+D800-U+DFFF (0xed followed by a byte above 0x9f)
     - values above U+10FFFF (0xf4 followed by a byte above 0x8f, leads above 0xf4) */
int u8_isvalid(const char *str, size_t len)
{
    const unsigned char *pnt;   // Current pointer in string
    const unsigned char *pend;  // End of string
    unsigned char byt;          // Lead byte of the sequence being checked
    int ret = 1;                // Stays 1 (ASCII) until a multi-byte sequence is accepted

    // Empty strings can be considered valid ASCII
    if (!len) return 1;
    pnt = (const unsigned char *)str;
    pend = pnt + len;

    while (pnt < pend) {
        byt = *pnt++;
        // ASCII bytes are always valid on their own
        if (byt < 0x80) continue;

        // A lead byte must be between 0xc2 and 0xf4 inclusive: this rejects
        // stray continuation bytes, the overlong leads 0xc0/0xc1 and leads
        // that could only encode values above U+10FFFF
        if (byt < 0xc2 || byt > 0xf4) return 0;

        if (byt < 0xe0) {               // 2-byte sequence
            // Must have one valid continuation byte
            if (pend - pnt < 1 || (pnt[0] & 0xc0) != 0x80) return 0;
            pnt += 1;
        } else if (byt < 0xf0) {        // 3-byte sequence
            // Must have 2 valid continuation bytes; comparing the remaining
            // length avoids forming a pointer beyond the end of the buffer
            if (pend - pnt < 2
                    || (pnt[0] & 0xc0) != 0x80
                    || (pnt[1] & 0xc0) != 0x80)
                return 0;
            // Check for overlong encoding of a code point below U+0800
            if (byt == 0xe0 && pnt[0] < 0xa0) return 0;
            // Check for surrogate chars
            if (byt == 0xed && pnt[0] > 0x9f) return 0;
            pnt += 2;
        } else {                        // 4-byte sequence
            // Must have 3 valid continuation bytes
            if (pend - pnt < 3
                    || (pnt[0] & 0xc0) != 0x80
                    || (pnt[1] & 0xc0) != 0x80
                    || (pnt[2] & 0xc0) != 0x80)
                return 0;
            // Make sure in correct range (0x10000 - 0x10ffff)
            if (byt == 0xf0) {
                if (pnt[0] < 0x90) return 0;
            } else if (byt == 0xf4) {
                if (pnt[0] > 0x8f) return 0;
            }
            pnt += 3;
        }
        ret = 2;    // At least one valid non-ASCII sequence seen
    }
    return ret;
}

int u8_reverse(char *dest, char *src, size_t len)
Expand Down
84 changes: 80 additions & 4 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1030,9 +1030,10 @@ end
# A symbol built from an invalid character must still print, expand and
# be reported by the parser in a predictable way
let
# make symbol with invalid char (0xdcdb lies in the UTF-16 low surrogate range)
sym = symbol(Char(0xdcdb))
# Converting the symbol to a string must give back that single character
@test string(sym) == "\udcdb"
@test string(sym) == string(Char(0xdcdb))
# Expansion must return the very same symbol
@test expand(sym) === sym
# With raise=false the parser must produce an :error expression naming the
# offending character rather than throwing
@test parse("\udcdb = 1",1,raise=false)[1] == Expr(:error, "invalid character \"\udcdb\"")
res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
@test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))"""
end

@test symbol("asdf") === :asdf
Expand Down Expand Up @@ -1284,17 +1285,92 @@ end
# A string containing a non-ASCII character is not valid ASCII
@test is_valid_ascii("Σ_not_valid_ascii") == false
# Ordinary characters, including NUL, are valid
@test is_valid_char('a') == true
@test is_valid_char('\x00') == true
# The surrogate code point 0xd800 is not a valid character
@test is_valid_char('\ud800') == false
@test is_valid_char(0xd800) == false

# UTF-16: a lone 0xd800 code unit is not valid
@test is_valid_utf16(utf16("a")) == true
@test is_valid_utf16(utf16("\ud800")) == false
@test is_valid_utf16(UInt16[0xd800,0]) == false
# TODO is_valid_utf8

# Issue #11140
# UTF-32: NUL is valid, the surrogate code point 0xd800 is not
@test is_valid_utf32(utf32("a")) == true
@test is_valid_utf32(utf32("\x00")) == true
@test is_valid_utf32(UInt32[0xd800,0]) == false

# Issue #11203
# Empty input must be reported as valid by every validator
@test is_valid_ascii(UInt8[]) == true
@test is_valid_utf8(UInt8[]) == true
@test is_valid_utf16(UInt16[]) == true
@test is_valid_utf32(UInt32[]) == true

# UTF-8 validation of one and two byte inputs

# Every ASCII byte is valid on its own
for byt = 0x00:0x7f
    @test is_valid_utf8(UInt8[byt]) == true
end
# A lone continuation byte, or a lead byte with nothing after it, is invalid
for byt = 0x80:0xff
    @test is_valid_utf8(UInt8[byt]) == false
end
# 0xc0 and 0xc1 can only start overlong two-byte sequences, so they are invalid
for lead = 0xc0:0xc1
    @test is_valid_utf8(UInt8[lead,0x80]) == false
end
# Any other two-byte lead is valid exactly when followed by a continuation byte
for lead = 0xc2:0xdf
    for second = 0x00:0x7f
        @test is_valid_utf8(UInt8[lead, second]) == false
    end
    for second = 0x80:0xbf
        @test is_valid_utf8(UInt8[lead, second]) == true
    end
    for second = 0xc0:0xff
        @test is_valid_utf8(UInt8[lead, second]) == false
    end
end
# Check three-byte sequences (lead byte 0xed is excluded here because its
# valid continuation range is restricted by the surrogate code points)
for r1 in (0xe0:0xec, 0xee:0xef)
    for byt = r1
        # Check for short sequence
        @test is_valid_utf8(UInt8[byt]) == false
        if byt == 0xe0
            # 0xe0 followed by 0x80:0x9f would be an overlong encoding of a
            # code point below U+0800, so only 0xa0:0xbf is acceptable
            r0 = ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false))
        else
            r0 = ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
        end
        for (rng,flg) in r0
            for cont in rng
                # Two bytes alone are always a truncated sequence
                @test is_valid_utf8(UInt8[byt, cont]) == false
                @test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
            end
        end
    end
end
# Lead byte 0xed: a second byte in 0x80:0x9f encodes U+D000-U+D7FF (Hangul
# range) and is valid, while 0xa0:0xbf would encode the surrogate code points
# U+D800-U+DFFF and must be rejected. Two bytes alone are a short sequence.
for (seconds, ok) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
    for second = seconds
        @test is_valid_utf8(UInt8[0xed, second]) == false
        @test is_valid_utf8(UInt8[0xed, second, 0x80]) == ok
    end
end
# Four-byte sequences: the second byte is restricted for the first and last
# lead byte so that only U+10000 through U+10FFFF can be encoded
for lead = 0xf0:0xf4
    cases = lead == 0xf0 ? ((0x00:0x8f, false), (0x90:0xbf, true), (0xc0:0xff, false)) :
            lead == 0xf4 ? ((0x00:0x7f, false), (0x80:0x8f, true), (0x90:0xff, false)) :
                           ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
    for (seconds, ok) in cases
        for second = seconds
            # Two or three bytes are always a truncated sequence
            @test is_valid_utf8(UInt8[lead, second]) == false
            @test is_valid_utf8(UInt8[lead, second, 0x80]) == false
            @test is_valid_utf8(UInt8[lead, second, 0x80, 0x80]) == ok
        end
    end
end
# Sequences longer than four bytes existed in the original UTF-8 design but
# are not valid UTF-8; each lead byte is followed by the full number of
# continuation bytes so that only the lead byte makes the input invalid

# Check five-byte sequences, should be invalid
for byt = 0xf8:0xfb
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check six-byte sequences, should be invalid
for byt = 0xfc:0xfd
    @test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
end
# Check seven-byte sequences (lead byte plus six continuation bytes), should be invalid
@test is_valid_utf8(UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]) == false

# This caused JuliaLang/JSON.jl#82
@test first('\x00':'\x7f') === '\x00'
@test last('\x00':'\x7f') === '\x7f'
Expand Down

0 comments on commit 2f019d7

Please sign in to comment.