From 9f26d58df0142c5b30b5fe64fafba772bbecdc68 Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Sat, 6 Jun 2015 07:28:03 +0200
Subject: [PATCH] Rebase against #11575

---
 base/sysimg.jl     |   1 -
 base/utf16.jl      |  37 ++++---
 base/utf32.jl      |  26 ++++-
 base/utfcheck.jl   |   6 +-
 base/utfconvert.jl | 248 ---------------------------------------------
 base/utferror.jl   |   1 -
 test/strings.jl    |  36 ++-----
 7 files changed, 60 insertions(+), 295 deletions(-)
 delete mode 100644 base/utfconvert.jl

diff --git a/base/sysimg.jl b/base/sysimg.jl
index 722acb83d0b0ba..44fba004d0dc14 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -87,7 +87,6 @@ include("osutils.jl")
 include("utferror.jl")
 include("utftypes.jl")
 include("utfcheck.jl")
-include("utfconvert.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")
diff --git a/base/utf16.jl b/base/utf16.jl
index 352770e28f2ce4..352ade64ba52bc 100644
--- a/base/utf16.jl
+++ b/base/utf16.jl
@@ -5,6 +5,10 @@
      @inbounds return flag ? S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len))
 end
 
+# Get rest of character ch from 3-byte UTF-8 sequence in dat
+@inline function get_utf8_3byte(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
+end
 # Get rest of character ch from 4-byte UTF-8 sequence in dat
 @inline function get_utf8_4byte(dat, pos, ch)
     @inbounds return (((ch & 0x7) << 18)
@@ -23,6 +27,8 @@ end
     end
 end
 
+const empty_utf16 = UTF16String(UInt16[0])
+
 function length(s::UTF16String)
     d = s.data
     len = length(d) - 1
@@ -71,7 +77,7 @@ function reverse(s::UTF16String)
         if is_surrogate_lead(ch)
             out[i],out[i-1] = out[i-1],ch
         else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
+            out[i] = ch
         end
     end
     UTF16String(out)
@@ -143,7 +149,7 @@ function convert(::Type{UTF16String}, str::UTF8String)
     # handle zero length string quickly
     sizeof(dat) == 0 && return empty_utf16
     # Check that is correct UTF-8 encoding and get number of words needed
-    len, flags, num4byte = check_string_utf8(dat)
+    len, flags, num4byte = check_string(dat)
     len += num4byte
     buf = Vector{UInt16}(len+1)
     @inbounds buf[len+1] = 0
@@ -177,19 +183,23 @@ end
 
 function convert(::Type{UTF8String}, dat::Vector{UInt16})
 "
-@brief      Converts a UTF-16 encoded vector of UInt16 to a UTF8String
+Converts a UTF-16 encoded vector of UInt16 to a UTF8String
+
+### Input Arguments:
+*   ::Type{UTF8String}
+*   dat::Vector{UInt16}
 
-@param[in]  ::Type{UTF8String}
-@param[in]  dat::Vector{UInt16}
+### Returns:
+*   UTF8String
 
-@return     ::UTF8String
-@throws     ArgumentError
-""" ->
+### Throws:
+*   UnicodeError
+"
     len = sizeof(dat)
     # handle zero length string quickly
-    len == 0 && return UTF8String("")
+    len == 0 && return empty_utf8
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
     return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -211,9 +221,9 @@ Converts a UTF16String to a UTF8String
     dat = str.data
     len = sizeof(dat) >>> 1
     # handle zero length string quickly
-    len <= 1 && return UTF8String("")
+    len <= 1 && return empty_utf8
     # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
+    len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
     flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
     return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
 end
@@ -226,7 +236,8 @@ function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len)
 *    dat         Vector{T}
 *    len         length of output in bytes
 
-@return     ::UTF8String
+### Returns:
+*   UTF8String
 "
     buf = Vector{UInt8}(len)
     out = 0
diff --git a/base/utf32.jl b/base/utf32.jl
index 318d2cffe6d4d2..90962e2538ea81 100644
--- a/base/utf32.jl
+++ b/base/utf32.jl
@@ -5,9 +5,26 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
 endof(s::UTF32String) = length(s.data) - 1
 length(s::UTF32String) = length(s.data) - 1
 
-reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
+utf32(x) = convert(UTF32String, x)
+convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
+convert(::Type{UTF32String}, s::UTF32String) = s
 
-sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
+function convert(::Type{UTF32String}, s::AbstractString)
+    a = Array(Char, length(s) + 1)
+    i = 0
+    for c in s
+        a[i += 1] = c
+    end
+    a[end] = Char(0) # NULL terminate
+    UTF32String(a)
+end
+
+function convert(::Type{UTF32String}, data::AbstractVector{Char})
+    len = length(data)
+    d = Array(Char, len + 1)
+    d[end] = Char(0) # NULL terminate
+    UTF32String(copy!(d,1, data,1, len))
+end
 
 convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
     convert(UTF32String, reinterpret(Char, data))
@@ -27,6 +44,9 @@ end
 convert(::Type{Vector{Char}}, str::UTF32String) = str.data
 convert(::Type{Array{Char}},  str::UTF32String) = str.data
 
+reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
+
+sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
 unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
     convert(Ptr{T}, pointer(s))
 
@@ -59,8 +79,6 @@ function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
 end
 isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
 
-utf32(x) = convert(UTF32String, x)
-
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
 function utf32(p::Union(Ptr{Char}, Ptr{UInt32}, Ptr{Int32}))
diff --git a/base/utfcheck.jl b/base/utfcheck.jl
index 417b0331a22bec..7d181735ae0593 100644
--- a/base/utfcheck.jl
+++ b/base/utfcheck.jl
@@ -133,12 +133,12 @@ end
 function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
                       dat::T,
                       len = endof(dat),
-                      pos = start(dat)
+                      pos = (T <: AbstractString) ? start(dat) : 1
                       ; options::Integer = 0)
 " Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string
 
 ### Input Arguments:
-* str    Vector of UInt16, UInt32, or an AbstractString
+* dat    Vector of UInt16, UInt32, or an AbstractString
 
 ### Optional Input Arguments:
 * len    length
@@ -169,7 +169,7 @@ function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)
             elseif ch < 0x800
                 num2byte += 1
                 flags |= UTF_UNICODE2
-            elseif T != Vector{UInt16} && ch > 0x0ffff
+            elseif ch > 0x0ffff
                 (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
                 num4byte += 1
             elseif !is_surrogate_codeunit(ch)
diff --git a/base/utfconvert.jl b/base/utfconvert.jl
deleted file mode 100644
index fa445f07498bb5..00000000000000
--- a/base/utfconvert.jl
+++ /dev/null
@@ -1,248 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# Functions to convert to different UTF encodings
-
-# Quickly copy and set trailing \0
-@inline function fast_utf_copy{S <: Union(UTF16String, UTF32String), T <: Union(UInt16, Char)}(::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
-     @inbounds return flag ? S(setindex!(copy!(Vector{T}(len+1),1,dat,1,len),0,len+1)) : S(setindex!(copy!(Vector{T}(len), dat), 0, len))
-end
-
-#=
-"""
-@brief      Converts an AbstractString to a UTF32String
-
-@param[in]  ::Type{UTF32String}
-@param[in]  str::AbstractString
-
-@return     ::UTF32String
-@throws     ArgumentError
-"""
-=#
-function convert(::Type{UTF32String}, str::AbstractString)
-    len, flags = check_string(str)
-    buf = Vector{Char}(len+1)
-    out = 0
-    @inbounds for ch in str ; buf[out += 1] = ch ; end
-    @inbounds buf[out + 1] = 0 # NULL termination
-    UTF32String(buf)
-end
-
-#=
-@doc """
-@brief      Encodes a UTF-32 encoded vector of UInt32 to a UTF8String
-
-@param[in]  ::Type{UTF8String}
-@param[in]  dat::Vector{UInt32}
-
-@return     ::UTF8String
-@throws     ArgumentError
-""" ->
-=#
-function convert(::Type{UTF8String}, dat::Vector{UInt32})
-    len = sizeof(dat)
-    # handle zero length string quickly
-    len == 0 && return UTF8String("")
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
-    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
-end
-
-#=
-@doc """
-@brief      Converts a UTF32String to a UTF8String
-
-@param[in]  ::Type{UTF8String}
-@param[in]  str::UTF32String
-
-@return     ::UTF8String
-@throws     ArgumentError
-""" ->
-=#
-function convert(::Type{UTF8String},  str::UTF32String)
-    dat = reinterpret(UInt32, str.data)
-    len = sizeof(dat) >>> 2
-    # handle zero length string quickly
-    len <= 1 && return UTF8String("")
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
-    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
-end
-
-
-#=
-"""
-@brief      Converts a UTF8String to a UTF32String
-
-@param[in]  ::Type{UTF32String}
-@param[in]  str::UTF8String
-
-@return     ::UTF32String
-@throws     ArgumentError
-"""
-=#
-function convert(::Type{UTF32String}, str::UTF8String)
-    dat = str.data
-    # handle zero length string quickly
-    sizeof(dat) == 0 && return empty_utf32
-    # Validate UTF-8 encoding, and get number of words to create
-    len, flags = check_string(dat)
-    # Optimize case where no characters > 0x7f
-    totlen = len+1
-    flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
-    # has multi-byte UTF-8 sequences
-    buf = Vector{Char}(totlen)
-    @inbounds buf[totlen] = 0 # NULL termination
-    local ch::UInt32, surr::UInt32
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle range 0x80-0x7ff
-        elseif ch < 0xe0
-            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
-        # Handle range 0x800-0xffff
-        elseif ch < 0xf0
-            pos += 2
-            ch = get_utf8_3byte(dat, pos, ch)
-            # Handle surrogate pairs (should have been encoded in 4 bytes)
-            if is_surrogate_lead(ch)
-                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
-                pos += 3
-                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
-                        | (UInt32(dat[pos-1] & 0x3f) << 6)
-                        | (dat[pos] & 0x3f))
-                ch = get_supplementary(ch, surr)
-            end
-            buf[out += 1] = ch
-        # Handle range 0x10000-0x10ffff
-        else
-            pos += 3
-            buf[out += 1] = get_utf8_4byte(dat, pos, ch)
-        end
-    end
-    UTF32String(buf)
-end
-
-#=
-"""
-@brief      Converts a UTF16String to UTF32String
-
-@param[in]  ::Type{UTF32String}
-@param[in]  str::UTF16String
-
-@return     ::UTF32String
-@throws     ArgumentError
-"""
-=#
-function convert(::Type{UTF32String}, str::UTF16String)
-    dat = str.data
-    len = sizeof(dat)
-    # handle zero length string quickly (account for trailing \0)
-    len <= 2 && return empty_utf32
-    # get number of words to create
-    len, flags, num4byte = check_string(dat, len>>>1)
-    # No surrogate pairs, do optimized copy
-    (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
-    local ch::UInt32
-    buf = Vector{Char}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = dat[pos += 1]
-        # check for surrogate pair
-        if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
-        buf[out += 1] = ch
-    end
-    UTF32String(buf)
-end
-
-#=
-"""
-@brief      Converts a UTF-32 encoded vector of UInt32 to a UTF16String
-
-@param[in]  ::Type{UTF16String}
-@param[in]  dat::Vector{UInt32}
-
-@return     ::UTF16String
-@throws     ArgumentError
-"""
-=#
-function convert(::Type{UTF16String}, dat::Vector{UInt32})
-    len = sizeof(dat)
-    # handle zero length string quickly
-    len <= 4 && return empty_utf16
-    # get number of words to allocate
-    len, flags, num4byte = check_string(dat, len>>>2)
-    len += num4byte + 1
-    # optimized path, no surrogates
-    num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
-    return encode_to_utf16(dat, len)
-end
-
-#=
-"""
-@brief      Converts a UTF32String to UTF16String
-
-@param[in]  ::Type{UTF16String}
-@param[in]  str::UTF32String
-
-@return     ::UTF16String
-@throws     ArgumentError
-"""
-=#
-function convert(::Type{UTF16String}, str::UTF32String)
-    dat = reinterpret(UInt32, str.data)
-    len = sizeof(dat)
-    # handle zero length string quickly
-    len <= 4 && return empty_utf16
-    # get number of words to allocate
-    len, flags, num4byte = check_string(dat, len>>>2)
-    # optimized path, no surrogates
-    num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
-    return encode_to_utf16(dat, len + num4byte)
-end
-
-#=
-@doc """
-@brief      Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String
-
-@param[in]  dat::Vector{UInt32} UTF-32 encoded data
-@param[in]  len                 length of output in 16-bit words
-
-@return     ::UTF16String
-""" ->
-=#
-function encode_to_utf16(dat, len)
-    buf = Vector{UInt16}(len)
-    @inbounds buf[len] = 0 # NULL termination
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = UInt32(dat[pos += 1])
-        if ch > 0xffff
-            # Output surrogate pair for 0x10000-0x10ffff
-            buf[out += 1] = 0xd7c0 + (ch >>> 10)
-            ch = 0xdc00 + (ch & 0x3ff)
-        end
-        buf[out += 1] = ch
-    end
-    UTF16String(buf)
-end
-
-convert(::Type{UTF8String},  dat::Vector{Char})   = convert(UTF8String, reinterpret(UInt32, dat))
-convert(::Type{UTF16String}, dat::Vector{Char})   = convert(UTF16String, reinterpret(UInt32, dat))
-convert(::Type{UTF32String}, c::Char)             = UTF32String(Char[c, Char(0)])
-
-function convert(::Type{UTF32String}, str::ASCIIString)
-    dat = str.data
-    fast_utf_copy(UTF32String, Char, length(dat)+1, dat)
-end
-
-convert(::Type{UTF32String}, dat::AbstractVector{Char}) = fast_utf_copy(UTF32String, Char, length(dat), dat, true)
-
-
diff --git a/base/utferror.jl b/base/utferror.jl
index cee5b2837b6c70..352bd03a163086 100644
--- a/base/utferror.jl
+++ b/base/utferror.jl
@@ -15,7 +15,6 @@ const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
 const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
 const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
 const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
-<<<<<<< HEAD
 const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
 const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
 const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
diff --git a/test/strings.jl b/test/strings.jl
index d780e370edbc7e..31c4c928713b6b 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1820,16 +1820,12 @@ byt = 0x0
 @test_throws UnicodeError Base.check_string(UInt32[0x110000])
 
 # issue #11551 (#11004,#10959)
-function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
+function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String)
     @test utf16(strUTF8) == strUTF16
-    @test utf32(strUTF8) == strUTF32
     @test utf8(strUTF16) == strUTF8
-    @test utf32(strUTF16) == strUTF32
-    @test utf8(strUTF32)  == strUTF8
-    @test utf16(strUTF32) == strUTF16
 end
 
-# Create some ASCII, UTF8, UTF16, and UTF32 strings
+# Create some ASCII, UTF8 and UTF16 strings
 strAscii = "abcdefgh"
 strA_UTF8 = ("abcdefgh\uff")[1:8]
 strL_UTF8 = "abcdef\uff\uff"
@@ -1847,37 +1843,28 @@ str2_UTF16 = utf16(str2_UTF8)
 str3_UTF16 = utf16(str3_UTF8)
 str4_UTF16 = utf16(str4_UTF8)
 strS_UTF16 = utf16(strS_UTF8)
-strA_UTF32 = utf32(strA_UTF8)
-strL_UTF32 = utf32(strL_UTF8)
-str2_UTF32 = utf32(str2_UTF8)
-str3_UTF32 = utf32(str3_UTF8)
-str4_UTF32 = utf32(str4_UTF8)
-strS_UTF32 = utf32(strS_UTF8)
+
 @test utf8(strAscii) == strAscii
 @test utf16(strAscii) == strAscii
-@test utf32(strAscii) == strAscii
-tstcvt(strA_UTF8,strA_UTF16,strA_UTF32)
-tstcvt(strL_UTF8,strL_UTF16,strL_UTF32)
-tstcvt(str2_UTF8,str2_UTF16,str2_UTF32)
-tstcvt(str3_UTF8,str3_UTF16,str3_UTF32)
-tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
+
+tstcvt(strA_UTF8,strA_UTF16)
+tstcvt(strL_UTF8,strL_UTF16)
+tstcvt(str2_UTF8,str2_UTF16)
+tstcvt(str3_UTF8,str3_UTF16)
+tstcvt(str4_UTF8,str4_UTF16)
+
 # Test converting surrogate pairs
 @test utf16(strS_UTF8) == strC_UTF8
-@test utf32(strS_UTF8) == strC_UTF8
 @test utf8(strS_UTF16) == strC_UTF8
-@test utf32(strS_UTF16) == strC_UTF8
-@test utf8(strS_UTF32)  == strC_UTF8
-@test utf16(strS_UTF32) == strC_UTF8
 
 # Test converting overlong \0
 # @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
 @test utf16(strZ_UTF8) == strz_UTF8
-@test utf32(strZ_UTF8) == strz_UTF8
 
 # Test invalid sequences
 
 byt = 0x0
-for T in (UTF16String, UTF32String)
+for T in (UTF16String,) # UTF32String
     try
     # Continuation byte not after lead
     for byt in 0x80:0xbf
@@ -1967,4 +1954,3 @@ for T in (UTF16String, UTF32String)
         throw(exp)
     end
 end
-