From 7a44c0430fcc825f73e7496f201279f7e67a705e Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Thu, 21 Dec 2017 19:54:09 -0500 Subject: [PATCH] deprecate Vector<->String conversion in favor of something safer add `CodeUnits` and `codeunits` fixes #24388 --- base/c.jl | 15 +++++++------- base/deprecated.jl | 15 ++++++++++++-- base/exports.jl | 1 + base/io.jl | 2 +- base/iobuffer.jl | 2 +- base/loading.jl | 4 ++-- base/repl/LineEdit.jl | 2 +- base/replutil.jl | 2 +- base/strings/basic.jl | 42 ++++++++++++++++++++++++++++++++++++--- base/strings/io.jl | 9 ++++++--- base/strings/search.jl | 4 ++-- base/strings/string.jl | 6 +++++- base/strings/strings.jl | 1 - base/strings/substring.jl | 2 ++ base/sysimg.jl | 7 ++++--- doc/src/stdlib/strings.md | 1 + test/char.jl | 2 +- test/core.jl | 2 +- test/misc.jl | 2 +- test/random.jl | 2 +- test/read.jl | 14 ++++++------- test/strings/basic.jl | 20 +++++++++++++++++-- 22 files changed, 116 insertions(+), 41 deletions(-) diff --git a/base/c.jl b/base/c.jl index 61163a93fcbc4..6a7403678771c 100644 --- a/base/c.jl +++ b/base/c.jl @@ -127,7 +127,7 @@ cconvert(::Type{Cstring}, s::AbstractString) = cconvert(Cstring, String(s)::String) function cconvert(::Type{Cwstring}, s::AbstractString) - v = transcode(Cwchar_t, Vector{UInt8}(String(s))) + v = transcode(Cwchar_t, String(s)) !isempty(v) && v[end] == 0 || push!(v, 0) return v end @@ -140,7 +140,7 @@ containsnul(p::Ptr, len) = containsnul(s::String) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) containsnul(s::AbstractString) = '\0' in s -function unsafe_convert(::Type{Cstring}, s::Union{String,Vector{UInt8}}) +function unsafe_convert(::Type{Cstring}, s::Union{String,AbstractVector{UInt8}}) p = unsafe_convert(Ptr{Cchar}, s) containsnul(p, sizeof(s)) && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) @@ -174,7 +174,7 @@ same argument. This is only available on Windows. """ function cwstring(s::AbstractString) - bytes = Vector{UInt8}(String(s)) + bytes = codeunits(String(s)) 0 in bytes && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) return push!(transcode(UInt16, bytes), 0) end @@ -202,19 +202,20 @@ Only conversion to/from UTF-8 is currently supported. """ function transcode end -transcode(::Type{T}, src::Vector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src +transcode(::Type{T}, src::AbstractVector{T}) where {T<:Union{UInt8,UInt16,UInt32,Int32}} = src transcode(::Type{T}, src::String) where {T<:Union{Int32,UInt32}} = T[T(c) for c in src] -transcode(::Type{T}, src::Vector{UInt8}) where {T<:Union{Int32,UInt32}} = transcode(T, String(src)) +transcode(::Type{T}, src::Union{Vector{UInt8},CodeUnits{UInt8,String}}) where {T<:Union{Int32,UInt32}} = + transcode(T, String(src)) function transcode(::Type{UInt8}, src::Vector{<:Union{Int32,UInt32}}) buf = IOBuffer() for c in src; print(buf, Char(c)); end take!(buf) end transcode(::Type{String}, src::String) = src -transcode(T, src::String) = transcode(T, Vector{UInt8}(src)) +transcode(T, src::String) = transcode(T, codeunits(src)) transcode(::Type{String}, src) = String(transcode(UInt8, src)) -function transcode(::Type{UInt16}, src::Vector{UInt8}) +function transcode(::Type{UInt16}, src::Union{Vector{UInt8},CodeUnits{UInt8,String}}) dst = UInt16[] i, n = 1, length(src) n > 0 || return dst diff --git a/base/deprecated.jl b/base/deprecated.jl index d9b2eededd1f1..a7a8f78b3d862 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1555,8 +1555,19 @@ export hex2num @deprecate ctranspose adjoint @deprecate ctranspose! adjoint! -@deprecate convert(::Type{Vector{UInt8}}, s::AbstractString) Vector{UInt8}(s) -@deprecate convert(::Type{Array{UInt8}}, s::AbstractString) Vector{UInt8}(s) +function convert(::Union{Type{Vector{UInt8}}, Type{Array{UInt8}}}, s::AbstractString) + depwarn("Strings can no longer be `convert`ed to byte arrays. Use `unsafe_wrap` or `codeunits` instead.", :Type) + unsafe_wrap(Vector{UInt8}, String(s)) +end +function (::Type{Vector{UInt8}})(s::String) + depwarn("Vector{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `codeunits` instead.", :Type) + unsafe_wrap(Vector{UInt8}, s) +end +function (::Type{Array{UInt8}})(s::String) + depwarn("Array{UInt8}(s::String) will copy data in the future. To avoid copying, use `unsafe_wrap` or `codeunits` instead.", :Type) + unsafe_wrap(Vector{UInt8}, s) +end + @deprecate convert(::Type{Vector{Char}}, s::AbstractString) Vector{Char}(s) @deprecate convert(::Type{Symbol}, s::AbstractString) Symbol(s) @deprecate convert(::Type{String}, s::Symbol) String(s) diff --git a/base/exports.jl b/base/exports.jl index 377461f00ef79..8079432be3db0 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -686,6 +686,7 @@ export chomp, chop, codeunit, + codeunits, dec, digits, digits!, diff --git a/base/io.jl b/base/io.jl index ae71210cb1336..c53f916c56ea8 100644 --- a/base/io.jl +++ b/base/io.jl @@ -706,7 +706,7 @@ function readuntil(io::IO, target::AbstractString) # decide how we can index target if target isa String # convert String to a utf8-byte-iterator - target = Vector{UInt8}(target) + target = codeunits(target) #elseif applicable(codeunit, target) # TODO: a more general version of above optimization # would be to permit accessing any string via codeunit diff --git a/base/iobuffer.jl b/base/iobuffer.jl index 00635942ff561..b94b1f045e895 100644 --- a/base/iobuffer.jl +++ b/base/iobuffer.jl @@ -27,7 +27,7 @@ function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool end # allocate Vector{UInt8}s for IOBuffer storage that can efficiently become Strings -StringVector(n::Integer) = Vector{UInt8}(_string_n(n)) +StringVector(n::Integer) = unsafe_wrap(Vector{UInt8}, _string_n(n)) # IOBuffers behave like Files. They are typically readable and writable. They are seekable. (They can be appendable). diff --git a/base/loading.jl b/base/loading.jl index cf1472437d9d2..a7ff6211f9c62 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -64,12 +64,12 @@ elseif Sys.isapple() break end # Hack to compensate for inability to create a string from a subarray with no allocations. - Vector{UInt8}(path_basename) == casepreserved_basename && return true + codeunits(path_basename) == casepreserved_basename && return true # If there is no match, it's possible that the file does exist but HFS+ # performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html. isascii(path_basename) && return false - Vector{UInt8}(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename + codeunits(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename end else # Generic fallback that performs a slow directory listing. diff --git a/base/repl/LineEdit.jl b/base/repl/LineEdit.jl index 35039f068edcd..a6a4ea1bd6165 100644 --- a/base/repl/LineEdit.jl +++ b/base/repl/LineEdit.jl @@ -657,7 +657,7 @@ function edit_splice!(s, r::Region=region(s), ins::AbstractString = ""; rigid_ma elseif buf.mark >= B buf.mark += sizeof(ins) - B + A end - ret = splice!(buf.data, A+1:B, Vector{UInt8}(ins)) # position(), etc, are 0-indexed + ret = splice!(buf.data, A+1:B, codeunits(String(ins))) # position(), etc, are 0-indexed buf.size = buf.size + sizeof(ins) - B + A adjust_pos && seek(buf, position(buf) + sizeof(ins)) String(ret) diff --git a/base/replutil.jl b/base/replutil.jl index 8ad88cef24d0e..29d890db61b99 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -283,7 +283,7 @@ function showerror(io::IO, ex::ErrorException) print(io, ex.msg) if ex.msg == "type String has no field data" println(io) - print(io, "Use `Vector{UInt8}(str)` instead.") + print(io, "Use `codeunits(str)` instead.") end end showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found") diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 703e455a6306b..0fba9536b194e 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -152,7 +152,6 @@ end getindex(s::AbstractString, i::Colon) = s # TODO: handle other ranges with stride ±1 specially? # TODO: add more @propagate_inbounds annotations? -getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r) getindex(s::AbstractString, v::AbstractVector{<:Integer}) = sprint(io->(for i in v; write(io, s[i]) end), sizehint=length(v)) getindex(s::AbstractString, v::AbstractVector{Bool}) = @@ -185,8 +184,8 @@ checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) = string() = "" string(s::AbstractString) = s -(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) -(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) +(::Type{Vector{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s)) +(::Type{Array{UInt8}})(s::AbstractString) = unsafe_wrap(Vector{UInt8}, String(s)) (::Type{Vector{Char}})(s::AbstractString) = collect(s) Symbol(s::AbstractString) = Symbol(String(s)) @@ -629,3 +628,40 @@ next(r::Iterators.Reverse{<:AbstractString}, i) = (r.itr[i], prevind(r.itr, i)) start(r::Iterators.Reverse{<:EachStringIndex}) = endof(r.itr.s) done(r::Iterators.Reverse{<:EachStringIndex}, i) = i < start(r.itr.s) next(r::Iterators.Reverse{<:EachStringIndex}, i) = (i, prevind(r.itr.s, i)) + +## code unit access ## + +""" + CodeUnits(s::AbstractString) + +Wrap a string (without copying) in an immutable vector-like object that accesses the code units +of the string's representation. +""" +struct CodeUnits{T,S<:AbstractString} <: DenseVector{T} + s::S + CodeUnits(s::S) where {S<:AbstractString} = new{codeunit(s),S}(s) +end + +length(s::CodeUnits) = ncodeunits(s.s) +sizeof(s::CodeUnits{T}) where {T} = ncodeunits(s.s) * sizeof(T) +size(s::CodeUnits) = (length(s),) +strides(s::CodeUnits) = (1,) +@propagate_inbounds getindex(s::CodeUnits, i::Int) = codeunit(s.s, i) +IndexStyle(::Type{<:CodeUnits}) = IndexLinear() +start(s::CodeUnits) = 1 +next(s::CodeUnits, i) = (@_propagate_inbounds_meta; (s[i], i+1)) +done(s::CodeUnits, i) = (@_inline_meta; i == length(s)+1) + +write(io::IO, s::CodeUnits) = write(io, s.s) + +unsafe_convert(::Type{Ptr{T}}, s::CodeUnits{T}) where {T} = unsafe_convert(Ptr{T}, s.s) +unsafe_convert(::Type{Ptr{Int8}}, s::CodeUnits{UInt8}) = unsafe_convert(Ptr{Int8}, s.s) + +""" + codeunits(s::AbstractString) + +Obtain a vector-like object containing the code units of a string. +Returns a `CodeUnits` wrapper by default, but `codeunits` may optionally be defined +for new string types if necessary. +""" +codeunits(s::AbstractString) = CodeUnits(s) diff --git a/base/strings/io.jl b/base/strings/io.jl index 538d7a55242e6..ca008fc61506f 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -188,8 +188,8 @@ julia> String(take!(io)) "Haho" ``` """ -IOBuffer(str::String) = IOBuffer(Vector{UInt8}(str)) -IOBuffer(s::SubString{String}) = IOBuffer(view(Vector{UInt8}(s.string), s.offset + 1 : s.offset + sizeof(s))) +IOBuffer(str::String) = IOBuffer(unsafe_wrap(Vector{UInt8}, str)) +IOBuffer(s::SubString{String}) = IOBuffer(view(unsafe_wrap(Vector{UInt8}, s.string), s.offset + 1 : s.offset + sizeof(s))) # join is implemented using IO @@ -373,7 +373,10 @@ function unescape_string(io, s::AbstractString) end end -macro b_str(s); :(Vector{UInt8}($(unescape_string(s)))); end +macro b_str(s) + v = Vector{UInt8}(codeunits(unescape_string(s))) + QuoteNode(v) +end """ @raw_str -> String diff --git a/base/strings/search.jl b/base/strings/search.jl index 9f895da99e066..f920b77520252 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -31,7 +31,7 @@ function search(a::ByteArray, b::Char, i::Integer = 1) if isascii(b) search(a,UInt8(b),i) else - search(a,Vector{UInt8}(string(b)),i).start + search(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end @@ -62,7 +62,7 @@ function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) if isascii(b) rsearch(a,UInt8(b),i) else - rsearch(a,Vector{UInt8}(string(b)),i).start + rsearch(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end diff --git a/base/strings/string.jl b/base/strings/string.jl index 888cda75c6046..1d1a15199f25d 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -60,7 +60,11 @@ This representation is often appropriate for passing strings to C. String(s::AbstractString) = print_to_string(s) String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s)) -(::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) +unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) + +(::Type{Vector{UInt8}})(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(uninitialized, length(s)), s) + +String(s::CodeUnits{UInt8,String}) = s.s ## low-level functions ## diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 91f10436a4e73..0edc0c4e4bba6 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -1,7 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license include("strings/substring.jl") -include("strings/basic.jl") include("strings/search.jl") include("strings/unicode.jl") include("strings/util.jl") diff --git a/base/strings/substring.jl b/base/strings/substring.jl index f7abe7dfb1e4f..f4cf3c9311537 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -155,3 +155,5 @@ function reverse(s::Union{String,SubString{String}})::String end end end + +getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r) diff --git a/base/sysimg.jl b/base/sysimg.jl index bb9851697d205..ab6906a7bd905 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -183,9 +183,6 @@ macro MIME_str(s) :(MIME{$(Expr(:quote, Symbol(s)))}) end -include("char.jl") -include("strings/string.jl") - # SIMD loops include("simdloop.jl") using .SimdLoop @@ -212,6 +209,10 @@ include("iterators.jl") using .Iterators: zip, enumerate using .Iterators: Flatten, product # for generators +include("char.jl") +include("strings/basic.jl") +include("strings/string.jl") + # Definition of StridedArray StridedReshapedArray{T,N,A<:Union{DenseArray,FastContiguousSubArray}} = ReshapedArray{T,N,A} StridedReinterpretArray{T,N,A<:Union{DenseArray,FastContiguousSubArray}} = ReinterpretArray{T,N,S,A} where S diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index 74ab2ca518f22..79e2b1b9fb448 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -15,6 +15,7 @@ Base.transcode Base.unsafe_string Base.ncodeunits(::AbstractString) Base.codeunit +Base.codeunits Base.ascii Base.@r_str Base.@raw_str diff --git a/test/char.jl b/test/char.jl index 9163122c95659..c5e98d302e41b 100644 --- a/test/char.jl +++ b/test/char.jl @@ -201,7 +201,7 @@ end @testset "read incomplete character at end of stream or file" begin local file = tempname() local iob = IOBuffer([0xf0]) - local bytes(c::Char) = Vector{UInt8}(string(c)) + local bytes(c::Char) = codeunits(string(c)) @test bytes(read(iob, Char)) == [0xf0] @test eof(iob) try diff --git a/test/core.jl b/test/core.jl index 407d6c137257f..68b00969a094f 100644 --- a/test/core.jl +++ b/test/core.jl @@ -4114,7 +4114,7 @@ b = "aaa" c = [0x2, 0x1, 0x3] @test check_nul(a) -@test check_nul(Vector{UInt8}(b)) +@test check_nul(unsafe_wrap(Vector{UInt8},b)) @test check_nul(c) d = [0x2, 0x1, 0x3] @test check_nul(d) diff --git a/test/misc.jl b/test/misc.jl index 12099f2aac4bf..9a1cd659988e5 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -394,7 +394,7 @@ end let s = "abcα🐨\0x\0" for T in (UInt8, UInt16, UInt32, Int32) - @test transcode(T, s) == transcode(T, Vector{UInt8}(s)) + @test transcode(T, s) == transcode(T, codeunits(s)) @test transcode(String, transcode(T, s)) == s end end diff --git a/test/random.jl b/test/random.jl index 2cbd423897b5d..614a6f36c8284 100644 --- a/test/random.jl +++ b/test/random.jl @@ -613,7 +613,7 @@ let b = ['0':'9';'A':'Z';'a':'z'] @test length(randstring(rng...)) == 8 @test length(randstring(rng..., 20)) == 20 @test issubset(randstring(rng...), b) - for c = ['a':'z', "qwèrtï", Set(Vector{UInt8}("gcat"))], + for c = ['a':'z', "qwèrtï", Set(codeunits("gcat"))], len = [8, 20] s = len == 8 ? randstring(rng..., c) : randstring(rng..., c, len) @test length(s) == len diff --git a/test/read.jl b/test/read.jl index d590a378687d8..b6b6fef9bebd2 100644 --- a/test/read.jl +++ b/test/read.jl @@ -165,7 +165,7 @@ for (name, f) in l @test readuntil(io(t), s) == m @test readuntil(io(t), SubString(s, start(s), endof(s))) == m @test readuntil(io(t), GenericString(s)) == m - @test readuntil(io(t), Vector{UInt8}(s)) == Vector{UInt8}(m) + @test readuntil(io(t), unsafe_wrap(Vector{UInt8},s)) == unsafe_wrap(Vector{UInt8},m) @test readuntil(io(t), collect(s)::Vector{Char}) == Vector{Char}(m) end cleanup() @@ -223,7 +223,7 @@ for (name, f) in l verbose && println("$name read...") - @test read(io()) == Vector{UInt8}(text) + @test read(io()) == unsafe_wrap(Vector{UInt8},text) @test read(io()) == read(filename) @@ -331,7 +331,7 @@ for (name, f) in l @test read("$filename.to", String) == text verbose && println("$name write(::IOBuffer, ...)") - to = IOBuffer(copy(Vector{UInt8}(text)), false, true) + to = IOBuffer(Vector{UInt8}(codeunits(text)), false, true) write(to, io()) @test String(take!(to)) == text @@ -400,14 +400,14 @@ test_read_nbyte() let s = "qwerty" - @test read(IOBuffer(s)) == Vector{UInt8}(s) - @test read(IOBuffer(s), 10) == Vector{UInt8}(s) - @test read(IOBuffer(s), 1) == Vector{UInt8}(s)[1:1] + @test read(IOBuffer(s)) == codeunits(s) + @test read(IOBuffer(s), 10) == codeunits(s) + @test read(IOBuffer(s), 1) == codeunits(s)[1:1] # Test growing output array x = UInt8[] n = readbytes!(IOBuffer(s), x, 10) - @test x == Vector{UInt8}(s) + @test x == codeunits(s) @test n == length(x) end diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 092828c6c4c76..26fe5fb5214b5 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -206,7 +206,7 @@ struct tstStringType <: AbstractString data::Array{UInt8,1} end @testset "AbstractString functions" begin - tstr = tstStringType(Vector{UInt8}("12")) + tstr = tstStringType(unsafe_wrap(Vector{UInt8},"12")) @test_throws MethodError ncodeunits(tstr) @test_throws MethodError codeunit(tstr) @test_throws MethodError codeunit(tstr, 1) @@ -697,7 +697,7 @@ end end end -@test Vector{UInt8}("\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80] +@test unsafe_wrap(Vector{UInt8},"\xcc\xdd\xee\xff\x80") == [0xcc,0xdd,0xee,0xff,0x80] @test next("a", 1)[2] == 2 @test nextind("a", 1) == 2 @@ -794,3 +794,19 @@ end @test nextind("\xf8\x9f\x98\x84", 1) == 2 @test next("\xf8\x9f\x98\x84z", 1)[2] == 2 @test nextind("\xf8\x9f\x98\x84z", 1) == 2 + +# codeunit vectors + +let s = "∀x∃y", u = codeunits(s) + @test u isa Base.CodeUnits{UInt8,String} + @test length(u) == ncodeunits(s) == 8 + @test sizeof(u) == sizeof(s) + @test eltype(u) === UInt8 + @test size(u) == (length(u),) + @test strides(u) == (1,) + @test u[1] == 0xe2 + @test u[2] == 0x88 + @test u[8] == 0x79 + @test_throws ErrorException (u[1] = 0x00) + @test collect(u) == b"∀x∃y" +end