From d1eb31c35ff5342d038cd4dab4e2fe1ecaae1b69 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 16 Feb 2018 22:34:53 +0100 Subject: [PATCH] Always resize input to zero length in String(::Vector{UInt8}) This makes the behavior more predictable than only resizing Vector{UInt8} inputs when they have been allocated via StringVector, as the caller may have obtained them from other code without knowing how they were created. This ensures code will not rely on the fact that a copy is made in many common cases. The behavior is also simpler to document. --- NEWS.md | 6 ++++++ base/strings/string.jl | 29 +++++++++++++++-------------- src/array.c | 12 +++++++++--- test/strings/basic.jl | 4 +++- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 19ca58a40f33c7..1cb3ccb38cc6e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -434,6 +434,11 @@ Library improvements * The function `thisind(s::AbstractString, i::Integer)` returns the largest valid index less or equal than `i` in the string `s` or `0` if no such index exists ([#24414]). + * `String(array)` now accepts an arbitrary `AbstractVector{UInt8}` and "steals" the + memory buffer of mutable arrays, leaving the byte vector with an empty buffer which + is guaranteed not to be shared with the `String` object; if the byte vector is + immutable, it simply shares memory with the string and is not truncated ([#26093]). + * `Irrational` is now a subtype of `AbstractIrrational` ([#24245]). * Introduced the `empty` function, the functional pair to `empty!` which returns a new, @@ -1324,3 +1329,4 @@ Command-line option changes [#25745]: https://github.com/JuliaLang/julia/issues/25745 [#25896]: https://github.com/JuliaLang/julia/issues/25896 [#25998]: https://github.com/JuliaLang/julia/issues/25998 +[#26093]: https://github.com/JuliaLang/julia/issues/26093 \ No newline at end of file diff --git a/base/strings/string.jl b/base/strings/string.jl index 487bfd8819d1d3..bea5d18e82f83a 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -16,19 +16,22 @@ const ByteArray = Union{Vector{UInt8},Vector{Int8}} # String constructor docstring from boot.jl, workaround for #16730 # and the unavailability of @doc in boot.jl context. """ - String(v::Vector{UInt8}) - -Create a new `String` from a vector `v` of bytes containing -UTF-8 encoded characters. This function takes "ownership" of -the array, which means that you should not subsequently modify -`v` (since strings are supposed to be immutable in Julia) for -as long as the string exists. - -If you need to subsequently modify `v`, use `String(copy(v))` instead. + String(v::AbstractVector{UInt8}) + +Create a new `String` object from a byte vector `v` containing UTF-8 encoded +characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and +future modification of `v` cannot affect the contents of the resulting string. +To avoid truncation use `String(copy(v))`. + +When possible, the memory of `v` will be used without copying when the `String` +object is created. This is guaranteed to be the case for byte vectors returned +by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to +[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings. +In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway +to guarantee consistent behavior. """ -function String(v::Array{UInt8,1}) - ccall(:jl_array_to_string, Ref{String}, (Any,), v) -end +String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v)) +String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v) """ unsafe_string(p::Ptr{UInt8}, [length::Integer]) @@ -64,8 +67,6 @@ unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{V (::Type{Vector{UInt8}})(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(uninitialized, length(s)), s) -String(a::AbstractVector{UInt8}) = String(copyto!(StringVector(length(a)), a)) - String(s::CodeUnits{UInt8,String}) = s.s ## low-level functions ## diff --git a/src/array.c b/src/array.c index 22cae65aa00691..5c36497ad7b294 100644 --- a/src/array.c +++ b/src/array.c @@ -434,13 +434,14 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len) JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) { + size_t len = jl_array_len(a); if (a->flags.how == 3 && a->offset == 0 && a->elsize == 1 && (jl_array_ndims(a) != 1 || - ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (jl_array_len(a) + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { + ((a->maxsize + sizeof(void*) + 1 <= GC_MAX_SZCLASS) == (len + sizeof(void*) + 1 <= GC_MAX_SZCLASS)))) { jl_value_t *o = jl_array_data_owner(a); if (jl_is_string(o)) { a->flags.isshared = 1; - *(size_t*)o = jl_array_len(a); + *(size_t*)o = len; a->nrows = 0; #ifdef STORE_ARRAY_LEN a->length = 0; @@ -449,7 +450,12 @@ JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) return o; } } - return jl_pchar_to_string((const char*)jl_array_data(a), jl_array_len(a)); + a->nrows = 0; +#ifdef STORE_ARRAY_LEN + a->length = 0; +#endif + a->maxsize = 0; + return jl_pchar_to_string((const char*)jl_array_data(a), len); } JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 92273526fa442a..5b7e2b8b24c059 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -3,8 +3,10 @@ using Random @testset "constructors" begin - @test String([0x61,0x62,0x63,0x21]) == "abc!" + v = [0x61,0x62,0x63,0x21] + @test String(v) == "abc!" && isempty(v) @test String("abc!") == "abc!" + @test String(0x61:0x63) == "abc" @test isempty(string()) @test eltype(GenericString) == Char