-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Unicode related error messages #11573
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,5 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
immutable UTF16String <: AbstractString | ||
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars | ||
function UTF16String(data::Vector{UInt16}) | ||
if length(data) < 1 || data[end] != 0 | ||
throw(ArgumentError("UTF16String data must be NULL-terminated")) | ||
end | ||
new(data) | ||
end | ||
end | ||
|
||
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800 | ||
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00 | ||
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800 | ||
|
@@ -39,7 +29,7 @@ function next(s::UTF16String, i::Int) | |
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1]) | ||
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2 | ||
end | ||
throw(ArgumentError("invalid UTF-16 character index")) | ||
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0)) | ||
end | ||
|
||
function reverseind(s::UTF16String, i::Integer) | ||
|
@@ -74,7 +64,7 @@ function encode16(s::AbstractString) | |
push!(buf, UInt16(0xd7c0 + (c>>10))) | ||
push!(buf, UInt16(0xdc00 + (c & 0x3ff))) | ||
else | ||
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)")) | ||
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one's actually less information because of losing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, that was one that I added myself (my very first PR). I can put it back in. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you put in the extra info and don't mind taking it out, sounds fine by me. |
||
end | ||
end | ||
push!(buf, 0) # NULL termination | ||
|
@@ -111,7 +101,7 @@ function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) | |
end | ||
|
||
function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) | ||
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data")) | ||
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) | ||
len = length(data) | ||
d = Array(UInt16, len + 1) | ||
d[end] = 0 # NULL terminate | ||
|
@@ -126,7 +116,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = | |
|
||
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | ||
isempty(bytes) && return UTF16String(UInt16[0]) | ||
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes")) | ||
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) | ||
data = reinterpret(UInt16, bytes) | ||
# check for byte-order mark (BOM): | ||
if data[1] == 0xfeff # native byte order | ||
|
@@ -142,7 +132,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) | |
copy!(d,1, data,1, length(data)) # assume native byte order | ||
end | ||
d[end] = 0 # NULL terminate | ||
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data")) | ||
!isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) | ||
UTF16String(d) | ||
end | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,7 +72,7 @@ function next(s::UTF8String, i::Int) | |
end | ||
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d) | ||
# b is a continuation byte of a valid UTF-8 character | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
nevermind |
||
throw(ArgumentError("invalid UTF-8 character index")) | ||
throw(UnicodeError(UTF_ERR_CONT, i, d[j])) | ||
end | ||
# move past 1 byte in case the data is actually Latin-1 | ||
return '\ufffd', i+1 | ||
|
@@ -198,7 +198,7 @@ function reverse(s::UTF8String) | |
out = similar(s.data) | ||
if ccall(:u8_reverse, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), | ||
out, s.data, length(out)) == 1 | ||
throw(ArgumentError("invalid UTF-8 data")) | ||
throw(UnicodeError(UTF_ERR_INVALID_8,0,0)) | ||
end | ||
UTF8String(out) | ||
end | ||
|
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data) | |
utf8(x) = convert(UTF8String, x) | ||
convert(::Type{UTF8String}, s::UTF8String) = s | ||
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) | ||
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence")) | ||
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(UnicodeError(UTF_ERR_INVALID_8)) | ||
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString) | ||
l = length(a) | ||
idx = 1 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
##\brief Error messages for Unicode / UTF support | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see what you did there. At least the backslash is less confusing here than There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I see the point about looking like a macro... that was done because in Python, for a docstring, you must use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comments are awesome. Extra annotations in the comments that don't do anything yet and don't mean anything to the majority of us who've probably never used doxygen are just noise. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, but to me, they help a lot when working on the code, even without the very nice stuff that doxygen builds from them... (have you ever tried doxygen on a very large project? It really is helpful...) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, but you're contributing to a pre-existing large project that doesn't use doxygen (yet, maybe that will change but that's not for this PR to worry about), and at least 2 core contributors have told you to knock it off with the doxygen annotations. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @one-more-minute You've convinced me... (as a matter of fact, I've worked for a while doing just that, pulling information out of unstructured data... I should have thought of that!). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Well I've been evaluating by spending more time than I'd like reviewing your PR's, and so far I don't like the doxygen markup. But I don't care enough to let it block this PR which is otherwise worth merging, so I'm just going to do that now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And I was just going to change this to using something more along the lines of Mike's idea... 😀 I guess I can do that in a later PR, as soon as the dust as settled a bit on the parsing Markdown ideas... Thanks! (and thanks for taking the time, and pushing back on the doxygen format, because now I will push for something better for documentation, which will still allow me to generate output for doxygen, but which fits better with Julia, and which I hope you do like) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Sure, I think it's completely reasonable to settle on some conventions eventually (these things usually happen naturally anyway). I think @catawbasam is right about side issues, though, so let's keep that discussion focussed in #8966 so that things can move forward here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tkelman and I've been spending more time than I'd like changing things so frequently 😀 but nobody has been holding a gun to either of our heads to make you review, or me to respond to your reviews. A long process yes, and I'm sorry for the time it has taken, but I think in the end, the final result for julia is much better. Thanks again for your time. |
||
|
||
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)" | ||
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)" | ||
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)" | ||
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)" | ||
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)" | ||
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>" | ||
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)" | ||
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)" | ||
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)" | ||
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated" | ||
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated" | ||
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>" | ||
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>" | ||
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)" | ||
const UTF_ERR_INVALID_8 = "invalid UTF-8 data" | ||
const UTF_ERR_INVALID_16 = "invalid UTF-16 data" | ||
const UTF_ERR_INVALID_INDEX = "invalid character index" | ||
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead" | ||
|
||
type UnicodeError <: Exception | ||
errmsg::AbstractString ##< A UTF_ERR_ message | ||
errpos::Int32 ##< Position of invalid character | ||
errchr::UInt32 ##< Invalid character | ||
end | ||
|
||
show(io::IO, exc::UnicodeError) = print(io, replace(replace(exc.errmsg,"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr))) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order | ||
# | ||
# \throws UnicodeError | ||
|
||
immutable UTF16String <: AbstractString | ||
data::Vector{UInt16} # includes 16-bit NULL termination after string chars | ||
function UTF16String(data::Vector{UInt16}) | ||
if length(data) < 1 || data[end] != 0 | ||
throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0)) | ||
end | ||
new(data) | ||
end | ||
end | ||
|
||
##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order | ||
# | ||
# \throws UnicodeError | ||
|
||
immutable UTF32String <: DirectIndexString | ||
data::Vector{Char} # includes 32-bit NULL termination after string chars | ||
|
||
function UTF32String(data::Vector{Char}) | ||
if length(data) < 1 || data[end] != Char(0) | ||
throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0)) | ||
end | ||
new(data) | ||
end | ||
end | ||
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data)) | ||
|
||
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data) | ||
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess the GC frame concern still apply here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@yuyichao Please get that fixed! 😀
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've actually got a new idea this morning and would like to try out. (Sth like a
throw
local noinline). Maybe that can bring down the gap between #11284 and #11508 .