From 9f26d58df0142c5b30b5fe64fafba772bbecdc68 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Sat, 6 Jun 2015 07:28:03 +0200 Subject: [PATCH] Rebase against #11575 --- base/sysimg.jl | 1 - base/utf16.jl | 37 ++++--- base/utf32.jl | 26 ++++- base/utfcheck.jl | 6 +- base/utfconvert.jl | 248 --------------------------------------------- base/utferror.jl | 1 - test/strings.jl | 36 ++----- 7 files changed, 60 insertions(+), 295 deletions(-) delete mode 100644 base/utfconvert.jl diff --git a/base/sysimg.jl b/base/sysimg.jl index 722acb83d0b0ba..44fba004d0dc14 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -87,7 +87,6 @@ include("osutils.jl") include("utferror.jl") include("utftypes.jl") include("utfcheck.jl") -include("utfconvert.jl") include("char.jl") include("ascii.jl") include("utf8.jl") diff --git a/base/utf16.jl b/base/utf16.jl index 352770e28f2ce4..352ade64ba52bc 100644 --- a/base/utf16.jl +++ b/base/utf16.jl @@ -5,6 +5,10 @@ @inbounds return flag ? S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len)) end +# Get rest of character ch from 3-byte UTF-8 sequence in dat +@inline function get_utf8_3byte(dat, pos, ch) + @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) +end # Get rest of character ch from 4-byte UTF-8 sequence in dat @inline function get_utf8_4byte(dat, pos, ch) @inbounds return (((ch & 0x7) << 18) @@ -23,6 +27,8 @@ end end end +const empty_utf16 = UTF16String(UInt16[0]) + function length(s::UTF16String) d = s.data len = length(d) - 1 @@ -71,7 +77,7 @@ function reverse(s::UTF16String) if is_surrogate_lead(ch) out[i],out[i-1] = out[i-1],ch else - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch)) + out[i] = ch end end UTF16String(out) @@ -143,7 +149,7 @@ function convert(::Type{UTF16String}, str::UTF8String) # handle zero length string quickly sizeof(dat) == 0 && return empty_utf16 # Check that is correct UTF-8 encoding and get number of words needed - 
len, flags, num4byte = check_string_utf8(dat) + len, flags, num4byte = check_string(dat) len += num4byte buf = Vector{UInt16}(len+1) @inbounds buf[len+1] = 0 @@ -177,19 +183,23 @@ end function convert(::Type{UTF8String}, dat::Vector{UInt16}) " -@brief Converts a UTF-16 encoded vector of UInt16 to a UTF8String +Converts a UTF-16 encoded vector of UInt16 to a UTF8String + +### Input Arguments: +* ::Type{UTF8String} +* dat::Vector{UInt16} -@param[in] ::Type{UTF8String} -@param[in] dat::Vector{UInt16} +### Returns: +* UTF8String -@return ::UTF8String -@throws ArgumentError -""" -> +### Throws: +* UnicodeError +" len = sizeof(dat) # handle zero length string quickly - len == 0 && return UTF8String("") + len == 0 && return empty_utf8 # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1) + len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1) flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) end @@ -211,9 +221,9 @@ Converts a UTF16String to a UTF8String dat = str.data len = sizeof(dat) >>> 1 # handle zero length string quickly - len <= 1 && return UTF8String("") + len <= 1 && return empty_utf8 # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1) + len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1) flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) end @@ -226,7 +236,8 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) * dat Vector{T} * len length of output in bytes -@return ::UTF8String +### Returns: +* UTF8String " buf = Vector{UInt8}(len) out = 0 diff --git a/base/utf32.jl b/base/utf32.jl index 318d2cffe6d4d2..90962e2538ea81 100644 --- a/base/utf32.jl +++ b/base/utf32.jl @@ -5,9 +5,26 @@ 
next(s::UTF32String, i::Int) = (s.data[i], i+1) endof(s::UTF32String) = length(s.data) - 1 length(s::UTF32String) = length(s.data) - 1 -reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) +utf32(x) = convert(UTF32String, x) +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) +convert(::Type{UTF32String}, s::UTF32String) = s -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) +function convert(::Type{UTF32String}, s::AbstractString) + a = Array(Char, length(s) + 1) + i = 0 + for c in s + a[i += 1] = c + end + a[end] = Char(0) # NULL terminate + UTF32String(a) +end + +function convert(::Type{UTF32String}, data::AbstractVector{Char}) + len = length(data) + d = Array(Char, len + 1) + d[end] = Char(0) # NULL terminate + UTF32String(copy!(d,1, data,1, len)) +end convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) = convert(UTF32String, reinterpret(Char, data)) @@ -27,6 +44,9 @@ end convert(::Type{Vector{Char}}, str::UTF32String) = str.data convert(::Type{Array{Char}}, str::UTF32String) = str.data +reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) + +sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) = convert(Ptr{T}, pointer(s)) @@ -59,8 +79,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32})) end isvalid(str::Vector{Char}) = isvalid(UTF32String, str) -utf32(x) = convert(UTF32String, x) - utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len) function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32})) diff --git a/base/utfcheck.jl b/base/utfcheck.jl index 417b0331a22bec..7d181735ae0593 100644 --- a/base/utfcheck.jl +++ b/base/utfcheck.jl @@ -133,12 +133,12 @@ end function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}( dat::T, len = 
endof(dat), - pos = start(dat) + pos = (T <: AbstractString) ? start(dat) : 1 ; options::Integer = 0) " Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string ### Input Arguments: -* str Vector of UInt16, UInt32, or an AbstractString +* dat Vector of UInt16, UInt32, or an AbstractString ### Optional Input Arguments: * len length @@ -169,7 +169,7 @@ function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString) elseif ch < 0x800 num2byte += 1 flags |= UTF_UNICODE2 - elseif T != Vector{UInt16} && ch > 0x0ffff + elseif ch > 0x0ffff (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch)) num4byte += 1 elseif !is_surrogate_codeunit(ch) diff --git a/base/utfconvert.jl b/base/utfconvert.jl deleted file mode 100644 index fa445f07498bb5..00000000000000 --- a/base/utfconvert.jl +++ /dev/null @@ -1,248 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -# Functions to convert to different UTF encodings - -# Quickly copy and set trailing \0 -@inline function fast_utf_copy{S <: Union(UTF16String, UTF32String), T <: Union(UInt16, Char)}(::Type{S}, ::Type{T}, len, dat, flag::Bool=false) - @inbounds return flag ? 
S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len)) -end - -#= -""" -@brief Converts an AbstractString to a UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::AbstractString - -@return ::UTF32String -@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::AbstractString) - len, flags = check_string(str) - buf = Vector{Char}(len+1) - out = 0 - @inbounds for ch in str ; buf[out += 1] = ch ; end - @inbounds buf[out + 1] = 0 # NULL termination - UTF32String(buf) -end - -#= -@doc """ -@brief Encodes a UTF-32 encoded vector of UInt32 to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] dat::Vector{UInt32} - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, dat::Vector{UInt32}) - len = sizeof(dat) - # handle zero length string quickly - len == 0 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -#= -@doc """ -@brief Converts a UTF32String to a UTF8String - -@param[in] ::Type{UTF8String} -@param[in] str::UTF32String - -@return ::UTF8String -@throws ArgumentError -""" -> -=# -function convert(::Type{UTF8String}, str::UTF32String) - dat = reinterpret(UInt32, str.data) - len = sizeof(dat) >>> 2 - # handle zero length string quickly - len <= 1 && return UTF8String("") - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1) - flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) -end - - -#= -""" -@brief Converts a UTF8String to a UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::UTF8String - -@return ::UTF32String 
-@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::UTF8String) - dat = str.data - # handle zero length string quickly - sizeof(dat) == 0 && return empty_utf32 - # Validate UTF-8 encoding, and get number of words to create - len, flags = check_string(dat) - # Optimize case where no characters > 0x7f - totlen = len+1 - flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat) - # has multi-byte UTF-8 sequences - buf = Vector{Char}(totlen) - @inbounds buf[totlen] = 0 # NULL termination - local ch::UInt32, surr::UInt32 - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle range 0x80-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) - # Handle range 0x800-0xffff - elseif ch < 0xf0 - pos += 2 - ch = get_utf8_3byte(dat, pos, ch) - # Handle surrogate pairs (should have been encoded in 4 bytes) - if is_surrogate_lead(ch) - # Build up 32-bit character from ch and trailing surrogate in next 3 bytes - pos += 3 - surr = ((UInt32(dat[pos-2] & 0xf) << 12) - | (UInt32(dat[pos-1] & 0x3f) << 6) - | (dat[pos] & 0x3f)) - ch = get_supplementary(ch, surr) - end - buf[out += 1] = ch - # Handle range 0x10000-0x10ffff - else - pos += 3 - buf[out += 1] = get_utf8_4byte(dat, pos, ch) - end - end - UTF32String(buf) -end - -#= -""" -@brief Converts a UTF16String to UTF32String - -@param[in] ::Type{UTF32String} -@param[in] str::UTF16String - -@return ::UTF32String -@throws ArgumentError -""" -=# -function convert(::Type{UTF32String}, str::UTF16String) - dat = str.data - len = sizeof(dat) - # handle zero length string quickly (account for trailing \0) - len <= 2 && return empty_utf32 - # get number of words to create - len, flags, num4byte = check_string(dat, len>>>1) - # No surrogate pairs, do optimized copy - (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) - local ch::UInt32 - buf = 
Vector{Char}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # check for surrogate pair - if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end - buf[out += 1] = ch - end - UTF32String(buf) -end - -#= -""" -@brief Converts a UTF-32 encoded vector of UInt32 to a UTF16String - -@param[in] ::Type{UTF16String} -@param[in] dat::Vector{UInt32} - -@return ::UTF16String -@throws ArgumentError -""" -=# -function convert(::Type{UTF16String}, dat::Vector{UInt32}) - len = sizeof(dat) - # handle zero length string quickly - len <= 4 && return empty_utf16 - # get number of words to allocate - len, flags, num4byte = check_string(dat, len>>>2) - len += num4byte + 1 - # optimized path, no surrogates - num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat) - return encode_to_utf16(dat, len) -end - -#= -""" -@brief Converts a UTF32String to UTF16String - -@param[in] ::Type{UTF16String} -@param[in] str::UTF32String - -@return ::UTF16String -@throws ArgumentError -""" -=# -function convert(::Type{UTF16String}, str::UTF32String) - dat = reinterpret(UInt32, str.data) - len = sizeof(dat) - # handle zero length string quickly - len <= 4 && return empty_utf16 - # get number of words to allocate - len, flags, num4byte = check_string(dat, len>>>2) - # optimized path, no surrogates - num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) - return encode_to_utf16(dat, len + num4byte) -end - -#= -@doc """ -@brief Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String - -@param[in] dat::Vector{UInt32} UTF-32 encoded data -@param[in] len length of output in 16-bit words - -@return ::UTF16String -""" -> -=# -function encode_to_utf16(dat, len) - buf = Vector{UInt16}(len) - @inbounds buf[len] = 0 # NULL termination - out = 0 - pos = 0 - @inbounds while out < len - ch = UInt32(dat[pos += 1]) - if ch > 0xffff - # Output surrogate pair for 0x10000-0x10ffff - buf[out += 1] = 0xd7c0 + (ch >>> 10) 
- ch = 0xdc00 + (ch & 0x3ff) - end - buf[out += 1] = ch - end - UTF16String(buf) -end - -convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) -convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) -convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) - -function convert(::Type{UTF32String}, str::ASCIIString) - dat = str.data - fast_utf_copy(UTF32String, Char, length(dat)+1, dat) -end - -convert(::Type{UTF32String}, dat::AbstractVector{Char}) = fast_utf_copy(UTF32String, Char, length(dat), dat, true) - - diff --git a/base/utferror.jl b/base/utferror.jl index cee5b2837b6c70..352bd03a163086 100644 --- a/base/utferror.jl +++ b/base/utferror.jl @@ -15,7 +15,6 @@ const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated" const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated" const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>" const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>" -<<<<<<< HEAD const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)" const UTF_ERR_INVALID_8 = "invalid UTF-8 data" const UTF_ERR_INVALID_16 = "invalid UTF-16 data" diff --git a/test/strings.jl b/test/strings.jl index d780e370edbc7e..31c4c928713b6b 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -1820,16 +1820,12 @@ byt = 0x0 @test_throws UnicodeError Base.check_string(UInt32[0x110000]) # issue #11551 (#11004,#10959) -function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) +function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String) @test utf16(strUTF8) == strUTF16 - @test utf32(strUTF8) == strUTF32 @test utf8(strUTF16) == strUTF8 - @test utf32(strUTF16) == strUTF32 - @test utf8(strUTF32) == strUTF8 - @test utf16(strUTF32) == strUTF16 end -# Create some ASCII, UTF8, UTF16, and UTF32 strings +# Create some ASCII, UTF8 and UTF16 strAscii = "abcdefgh" 
strA_UTF8 = ("abcdefgh\uff")[1:8] strL_UTF8 = "abcdef\uff\uff" @@ -1847,37 +1843,28 @@ str2_UTF16 = utf16(str2_UTF8) str3_UTF16 = utf16(str3_UTF8) str4_UTF16 = utf16(str4_UTF8) strS_UTF16 = utf16(strS_UTF8) -strA_UTF32 = utf32(strA_UTF8) -strL_UTF32 = utf32(strL_UTF8) -str2_UTF32 = utf32(str2_UTF8) -str3_UTF32 = utf32(str3_UTF8) -str4_UTF32 = utf32(str4_UTF8) -strS_UTF32 = utf32(strS_UTF8) + @test utf8(strAscii) == strAscii @test utf16(strAscii) == strAscii -@test utf32(strAscii) == strAscii -tstcvt(strA_UTF8,strA_UTF16,strA_UTF32) -tstcvt(strL_UTF8,strL_UTF16,strL_UTF32) -tstcvt(str2_UTF8,str2_UTF16,str2_UTF32) -tstcvt(str3_UTF8,str3_UTF16,str3_UTF32) -tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) + +tstcvt(strA_UTF8,strA_UTF16) +tstcvt(strL_UTF8,strL_UTF16) +tstcvt(str2_UTF8,str2_UTF16) +tstcvt(str3_UTF8,str3_UTF16) +tstcvt(str4_UTF8,str4_UTF16) + # Test converting surrogate pairs @test utf16(strS_UTF8) == strC_UTF8 -@test utf32(strS_UTF8) == strC_UTF8 @test utf8(strS_UTF16) == strC_UTF8 -@test utf32(strS_UTF16) == strC_UTF8 -@test utf8(strS_UTF32) == strC_UTF8 -@test utf16(strS_UTF32) == strC_UTF8 # Test converting overlong \0 # @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) @test utf16(strZ_UTF8) == strz_UTF8 -@test utf32(strZ_UTF8) == strz_UTF8 # Test invalid sequences byt = 0x0 -for T in (UTF16String, UTF32String) +for T in (UTF16String,) # UTF32String try # Continuation byte not after lead for byt in 0x80:0xbf @@ -1967,4 +1954,3 @@ for T in (UTF16String, UTF32String) throw(exp) end end -