Skip to content

Commit

Permalink
Fix JuliaLang#10959 bugs with UTF-16 conversions
Browse files Browse the repository at this point in the history
Rewrote a number of the conversions between ASCIIString, UTF8String, and UTF16String.
Rewrote length() for UTF16String().
Improved reverse() for UTF16String().

Added over 150 lines of testing code to detect the above conversion problems

Added (in a gist) code to show other conversion problems not yet fixed:
https://gist.github.com/ScottPJones/4e6e8938f0559998f9fc

Added (in a gist) code to benchmark the performance, to ensure that adding the extra validity
checking did not adversely affect performance (in fact, performance was greatly improved).
https://gist.github.com/ScottPJones/79ed895f05f85f333d84

Updated based on review comments

Changes to error handling and check_string

Rebased against JuliaLang#11575
Updated comment to go before function, not indented by 4

Updated to use unsafe_checkstring

Removed redundant argument documentation
  • Loading branch information
ScottPJones committed Jun 25, 2015
1 parent 08fdb0a commit a286ce0
Show file tree
Hide file tree
Showing 4 changed files with 369 additions and 66 deletions.
282 changes: 225 additions & 57 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,42 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# Lead (high) surrogates occupy 0xd800-0xdbff
utf16_is_lead(c::UInt16) = 0xd800 <= c <= 0xdbff
# Trail (low) surrogates occupy 0xdc00-0xdfff
utf16_is_trail(c::UInt16) = 0xdc00 <= c <= 0xdfff
# Any surrogate code unit, lead or trail, lies in 0xd800-0xdfff
utf16_is_surrogate(c::UInt16) = 0xd800 <= c <= 0xdfff
# Combine a lead/trail surrogate pair into the character it encodes
function utf16_get_supplementary(lead::UInt16, trail::UInt16)
    # 0xd7f7 folds the lead bias, the trail bias and the 0x10000 offset together
    high = UInt32(lead - 0xd7f7) << 10
    return Char(high + trail)
end
# Build an `S` from a fresh `Vector{T}` of len+1 elements filled from `dat`, with the
# final slot set to 0. With `flag` true only `len` elements are copied from `dat`;
# otherwise len+1 elements are copied before the final slot is overwritten.
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
                  ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    ncopy = flag ? len : len+1
    buf = Vector{T}(len+1)
    copy!(buf, 1, dat, 1, ncopy)
    buf[len+1] = 0 # trailing \0
    return S(buf)
end

# Finish decoding a 3-byte UTF-8 sequence whose lead byte is `ch` and whose
# last continuation byte is dat[pos]
@inline function get_utf8_3byte(dat, pos, ch)
    @inbounds begin
        mid = UInt32(dat[pos-1] & 0x3f)
        low = dat[pos] & 0x3f
    end
    # 4 payload bits from the lead byte, 6 from each continuation byte
    return ((ch & 0xf) << 12) | (mid << 6) | low
end
# Finish decoding a 4-byte UTF-8 sequence whose lead byte is `ch` and whose
# last continuation byte is dat[pos]
@inline function get_utf8_4byte(dat, pos, ch)
    @inbounds begin
        b2 = UInt32(dat[pos-2] & 0x3f)
        b3 = UInt32(dat[pos-1] & 0x3f)
        b4 = dat[pos] & 0x3f
    end
    # 3 payload bits from the lead byte, 6 from each continuation byte
    return ((ch & 0x7) << 18) | (b2 << 12) | (b3 << 6) | b4
end

# Write `ch` as a 4-byte UTF-8 sequence into buf[out+1] through buf[out+4]
@inline function output_utf8_4byte!(buf, out, ch)
    lead  = 0xf0 | (ch >>> 18)
    cont1 = 0x80 | ((ch >>> 12) & 0x3f)
    cont2 = 0x80 | ((ch >>> 6) & 0x3f)
    cont3 = 0x80 | (ch & 0x3f)
    @inbounds begin
        buf[out + 1] = lead
        buf[out + 2] = cont1
        buf[out + 3] = cont2
        buf[out + 4] = cont3
    end
end

# Empty UTF16String constant: its data holds a single 0 code unit
const empty_utf16 = UTF16String(UInt16[0])

# Number of characters in `s`: every code unit that is not a trail surrogate
# starts a character, so trail surrogates are the only units left uncounted.
function length(s::UTF16String)
    d = s.data
    len = length(d) - 1 # s.data includes the NULL terminator
    len == 0 && return 0
    cnum = 0
    for i = 1:len
        # count each code unit exactly once (a Bool adds as 0 or 1)
        @inbounds cnum += !is_surrogate_trail(d[i])
    end
    return cnum
end
# Index of the last character of `s`: when the final code unit (ignoring the
# NULL terminator) is a surrogate, the character starts one unit earlier.
function endof(s::UTF16String)
    d = s.data
    i = length(d) - 1
    i == 0 && return i
    return is_surrogate_codeunit(d[i]) ? i-1 : i
end

# Combine a lead/trail surrogate pair into the code point it encodes
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end

# Iteration step: return the character starting at code unit `i` and the index
# of the code unit that follows it.
# Throws a `UnicodeError` when `i` points at a surrogate that is not the lead
# of a complete lead/trail pair.
function next(s::UTF16String, i::Int)
    ch = s.data[i]
    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
    # check length, account for terminating \0
    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
    ct = s.data[i+1]
    # wrap in UnicodeError like the other failures (a bare tuple was thrown here)
    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
    return Char(get_supplementary(ch, ct)), i+2
end

# Mirror index `i` within the data (j = length(s.data) - i); when that code unit
# is a trail surrogate, return the index of the unit before it instead.
function reverseind(s::UTF16String, i::Integer)
    j = length(s.data) - i
    return is_surrogate_trail(s.data[j]) ? j-1 : j
end

# Index of the last code unit of `s`
function lastidx(s::UTF16String)
    # s.data includes NULL terminator
    return length(s.data) - 1
end

# Return a new UTF16String with the characters of `s` in reverse order.
# Code units are copied back to front; a surrogate pair is kept in lead/trail
# order by swapping when the lead is reached.
function reverse(s::UTF16String)
    d = s.data
    out = similar(d)
    out[end] = 0 # NULL termination
    n = length(d)
    @inbounds for i = 1:n-1
        ch = d[n-i]
        if is_surrogate_lead(ch)
            # the trail was written at out[i-1] on the previous step; put the
            # lead in front of it (presumes `s` never ends in a lead surrogate)
            out[i],out[i-1] = out[i-1],ch
        else
            out[i] = ch
        end
    end
    UTF16String(out)
end

# Size of the string's code units in bytes, not counting the one trailing
# UInt16 code unit
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)

# Report whether `data` is well-formed UTF-16, i.e. has no unpaired surrogates
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    pos = 1
    @inbounds while pos < n
        cu = data[pos]
        if is_surrogate_lead(cu) && is_surrogate_trail(data[pos+1])
            pos += 2 # complete lead/trail pair
        elseif is_surrogate_codeunit(cu)
            return false # unpaired surrogate
        else
            pos += 1
        end
    end
    # at most one code unit is left; it must not be a surrogate
    return pos > n || !is_surrogate_codeunit(data[pos])
end

"
Converts an `AbstractString` to a `UTF16String`
### Returns:
* `UTF16String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::AbstractString)
    len, flags, num4byte = unsafe_checkstring(str)
    # len+num4byte code units plus one for the terminator
    buf = Vector{UInt16}(len+num4byte+1)
    out = 0
    @inbounds for ch in str
        c = UInt32(ch)
        if c < 0x10000
            buf[out += 1] = UInt16(c)
        else
            # output surrogate pair, computed from the code point `c`
            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
        end
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF16String(buf)
end

# Convenience wrapper around convert(UTF16String, x)
utf16(x) = convert(UTF16String, x)
# A UTF16String converts to itself
convert(::Type{UTF16String}, s::UTF16String) = s
# Converting to an array of UInt16 returns the string's data array itself (no copy)
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data

"
Converts a `UTF8String` to a `UTF16String`

### Returns:
* `UTF16String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::UTF8String)
    bytes = str.data
    # an empty input maps onto the shared empty constant
    sizeof(bytes) == 0 && return empty_utf16
    # validate the UTF-8 encoding and work out how many words are needed
    nchars, flags, num4byte = unsafe_checkstring(bytes)
    nunits = nchars + num4byte # each 4-byte sequence needs a second code unit
    buf = Vector{UInt16}(nunits+1)
    @inbounds buf[nunits+1] = 0
    # fast path when there are no characters > 0x7f: widen every byte as-is
    flags == 0 && @inbounds return UTF16String(copy!(buf, bytes))
    out = 0
    pos = 0
    @inbounds while out < nunits
        pos += 1
        ch::UInt32 = bytes[pos]
        if ch <= 0x7f
            # ASCII
            out += 1
            buf[out] = ch
        elseif ch < 0xe0
            # 2-byte sequence, range 0x80-0x7ff
            pos += 1
            out += 1
            buf[out] = ((ch & 0x1f) << 6) | (bytes[pos] & 0x3f)
        elseif ch < 0xf0
            # 3-byte sequence, range 0x800-0xffff
            pos += 2
            out += 1
            buf[out] = get_utf8_3byte(bytes, pos, ch)
        else
            # 4-byte sequence, range 0x10000-0x10ffff: emit a surrogate pair
            pos += 3
            ch = get_utf8_4byte(bytes, pos, ch)
            out += 1
            buf[out] = UInt16(0xd7c0 + (ch >>> 10))
            out += 1
            buf[out] = UInt16(0xdc00 + (ch & 0x3ff))
        end
    end
    return UTF16String(buf)
end

"
Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String`

### Returns:
* `UTF8String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, dat::Vector{UInt16})
    len = sizeof(dat)
    # handle zero length string quickly
    len == 0 && return empty_utf8
    # get number of bytes to allocate (sizeof is in bytes, so halve it for code units)
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1)
    # no characters > 0x7f: every code unit narrows to a single byte
    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
Converts a `UTF16String` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, str::UTF16String)
    units = str.data
    total = sizeof(units) >>> 1 # code units, trailing \0 included
    # nothing but the terminator: zero length string
    total <= 1 && return empty_utf8
    # check the code units before the terminator and get the counts for sizing
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(units, 1, total-1)
    if flags == 0
        # no multi-byte characters: narrow each code unit to one byte
        bytes = Vector{UInt8}(len)
        @inbounds copy!(bytes, 1, units, 1, len)
        return UTF8String(bytes)
    end
    nbytes = len + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt16, units, nbytes)
end

"
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Input Arguments:
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
* `len` length of output in bytes
### Returns:
* `UTF8String`
"
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    buf = Vector{UInt8}(len)
    out = 0 # last byte written to buf
    pos = 0 # last code unit read from dat
    # stop as soon as `len` bytes are written, so trailing input is never read
    @inbounds while out < len
        ch::UInt32 = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle 0x80-0x7ff
        elseif ch < 0x800
            buf[out += 1] = 0xc0 | (ch >>> 6)
            buf[out += 1] = 0x80 | (ch & 0x3f)
        # Handle 0x10000-0x10ffff (if input is UInt32)
        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
            output_utf8_4byte!(buf, out, ch)
            out += 4
        # Handle surrogate pairs: consume the next code unit as the trail
        elseif is_surrogate_codeunit(ch)
            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
            out += 4
        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
        else
            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
            buf[out += 1] = 0x80 | (ch & 0x3f)
        end
    end
    UTF8String(buf)
end

# Converts an `ASCIIString` to a `UTF16String` by copying its bytes into UInt16
# code units; `true` tells fast_utf_copy to copy exactly length(dat) elements
# before it sets the trailing \0.
function convert(::Type{UTF16String}, str::ASCIIString)
    dat = str.data
    @inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true)
end

# Converting to a vector of UInt16 returns the string's data array itself (no copy)
function convert(::Type{Vector{UInt16}}, str::UTF16String)
    return str.data
end
function convert(::Type{Array{UInt16}}, str::UTF16String)
    return str.data
end

# A UTF16String converts to itself
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end

# Pointer to the string's data, converted to a pointer to Int16 or UInt16
function unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end

# Reshape an array of UInt16 to one dimension and convert that
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    flat = reshape(data, length(data))
    return convert(T, flat)
end

# Reinterpret Int16 data as UInt16 and convert that
function convert(T::Type{UTF16String}, data::AbstractArray{Int16})
    units = reinterpret(UInt16, data)
    return convert(T, units)
end

# Validate `data` as UTF-16 and copy it into a new NULL-terminated UTF16String.
# Throws a `UnicodeError` when `data` fails the isvalid check.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    isvalid(UTF16String, data) || throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
    n = length(data)
    buf = Vector{UInt16}(n+1)
    @inbounds begin
        copy!(buf, 1, data, 1, n)
        buf[n+1] = 0 # NULL terminate
    end
    return UTF16String(buf)
end

function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
Expand All @@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
UTF16String(d)
end

# A UTF16String converts to itself
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end

# Convenience wrapper around convert(UTF16String, x)
function utf16(x)
    return convert(UTF16String, x)
end
# Build from `len` code units starting at pointer `p`
function utf16(p::Ptr{UInt16}, len::Integer)
    return utf16(pointer_to_array(p, len))
end
# Int16 pointers are handled by converting to a UInt16 pointer first
function utf16(p::Ptr{Int16}, len::Integer)
    return utf16(convert(Ptr{UInt16}, p), len)
end
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
Expand Down
Loading

0 comments on commit a286ce0

Please sign in to comment.