Store String bytes inline instead of wrapping an Array{UInt8,1}

String becomes a `type` whose only declared field is `len::Int`. Its
constructor calls jl_pchar_to_string, which allocates sizeof(void*)+len
bytes, writes the length into the first word and copies the bytes directly
after it. pointer(s) is the object address plus sizeof(Int), and the
accessors in base/strings/string.jl switch from indexing `s.data` to
unsafe_load on that pointer.

Review notes on the new code:
- reverseind validates the byte position itself, because unsafe_load
  (unlike the `d[j]` indexing it replaces) performs no bounds check.
- jl_array_to_string keeps its UInt8-array type check before copying.
- jl_string_data / jl_string_len parenthesize their macro argument.

diff --git a/base/boot.jl b/base/boot.jl
index f487b01279f374..4ebb9abb493b5a 100644
--- a/base/boot.jl
+++ b/base/boot.jl
@@ -221,10 +221,10 @@ end
 
 abstract DirectIndexString <: AbstractString
 
-immutable String <: AbstractString
-    data::Array{UInt8,1}
-    # required to make String("foo") work (#15120):
-    String(d::Array{UInt8,1}) = new(d)
+type String <: AbstractString
+    len::Int
+    String(p::Ptr{UInt8}, len::Int) =
+        ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
 end
 
 # This should always be inlined
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 1ac5660378cdc3..edc800543e3d9b 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -57,8 +57,8 @@ function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
     ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
 end
 
-convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
-convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
+#convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
+#convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
 convert(::Type{String}, s::AbstractString) = String(s)
 convert(::Type{Vector{Char}}, s::AbstractString) = collect(s)
 convert(::Type{Symbol}, s::AbstractString) = Symbol(s)
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 9ff67987338e71..2f6550aea0af48 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -34,46 +34,35 @@ const utf8_trailing = [
 ## required core functionality ##
 
 function endof(s::String)
-    d = s.data
-    i = length(d)
-    @inbounds while i > 0 && is_valid_continuation(d[i])
+    p = pointer(s)
+    i = s.len
+    while i > 0 && is_valid_continuation(unsafe_load(p,i))
         i -= 1
     end
     i
 end
 
 function length(s::String)
-    d = s.data
+    p = pointer(s)
     cnum = 0
-    for i = 1:length(d)
-        @inbounds cnum += !is_valid_continuation(d[i])
+    for i = 1:s.len
+        cnum += !is_valid_continuation(unsafe_load(p,i))
     end
     cnum
 end
 
-@noinline function slow_utf8_next(d::Vector{UInt8}, b::UInt8, i::Int)
-    # potentially faster version
-    # d = s.data
-    # a::UInt32 = d[i]
-    # if a < 0x80; return Char(a); end
-    # #if a&0xc0==0x80; return '\ufffd'; end
-    # b::UInt32 = a<<6 + d[i+1]
-    # if a < 0xe0; return Char(b - 0x00003080); end
-    # c::UInt32 = b<<6 + d[i+2]
-    # if a < 0xf0; return Char(c - 0x000e2080); end
-    # return Char(c<<6 + d[i+3] - 0x03c82080)
-
+@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
     if is_valid_continuation(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
     end
     trailing = utf8_trailing[b + 1]
-    if length(d) < i + trailing
+    if l < i + trailing
         return '\ufffd', i+1
     end
     c::UInt32 = 0
     for j = 1:(trailing + 1)
         c <<= 6
-        c += d[i]
+        c += unsafe_load(p,i)
         i += 1
     end
     c -= utf8_offset[trailing + 1]
@@ -84,12 +73,15 @@ end
     # function is split into this critical fast-path
     # for pure ascii data, such as parsing numbers,
     # and a longer function that can handle any utf8 data
-    d = s.data
-    b = d[i]
+    if i < 1 || i > s.len
+        throw(BoundsError(s,i))
+    end
+    p = pointer(s)
+    b = unsafe_load(p, i)
     if b < 0x80
-        return Char(b), i + 1
+        return Char(b), i+1
     end
-    return slow_utf8_next(d, b, i)
+    return slow_utf8_next(p, b, i, s.len)
 end
 
 function first_utf8_byte(ch::Char)
@@ -102,9 +94,14 @@ function first_utf8_byte(ch::Char)
 end
 
 function reverseind(s::String, i::Integer)
-    j = length(s.data) + 1 - i
-    d = s.data
-    while is_valid_continuation(d[j])
+    j = s.len + 1 - i
+    # unsafe_load is unchecked: keep the BoundsError that d[j] used to raise
+    if j < 1 || j > s.len
+        throw(BoundsError(s,j))
+    end
+    p = pointer(s)
+    while is_valid_continuation(unsafe_load(p,j))
         j -= 1
+        j < 1 && throw(BoundsError(s,j))
     end
     return j
@@ -112,10 +109,10 @@ end
 
 ## overload methods for efficiency ##
 
-sizeof(s::String) = sizeof(s.data)
+sizeof(s::String) = s.len
 
 isvalid(s::String, i::Integer) =
-    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
+    (1 <= i <= s.len) && !is_valid_continuation(unsafe_load(pointer(s),i))
 
 const empty_utf8 = String(UInt8[])
 
@@ -237,10 +234,10 @@ function reverse(s::String)
     String(buf)
 end
 
-write(io::IO, s::String) = write(io, s.data)
+write(io::IO, s::String) = write(io, pointer(s), s.len)
 
-pointer(x::String) = pointer(x.data)
-pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
+pointer(s::String) = convert(Ptr{UInt8}, pointer_from_objref(s)+sizeof(Int))
+pointer(x::String, i::Integer) = pointer(x)+(i-1)
 
 convert(::Type{String}, s::String) = s
 convert(::Type{String}, v::Vector{UInt8}) = String(v)
diff --git a/src/array.c b/src/array.c
index a9c18062b0a01c..44d8c424676185 100644
--- a/src/array.c
+++ b/src/array.c
@@ -369,20 +369,16 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len)
 
 JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a)
 {
-    jl_ptls_t ptls = jl_get_ptls_states();
     if (!jl_typeis(a, jl_array_uint8_type))
         jl_type_error("jl_array_to_string", (jl_value_t*)jl_array_uint8_type, (jl_value_t*)a);
-    jl_value_t *s = jl_gc_alloc(ptls, sizeof(void*), jl_string_type);
-    jl_set_nth_field(s, 0, (jl_value_t*)a);
-    return s;
+    return jl_pchar_to_string(jl_array_data(a), jl_array_len(a));
 }
 
 JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len)
 {
-    jl_array_t *a = jl_pchar_to_array(str, len);
-    JL_GC_PUSH1(&a);
-    jl_value_t *s = jl_array_to_string(a);
-    JL_GC_POP();
+    jl_value_t *s = jl_gc_allocobj(jl_get_ptls_states(), sizeof(void*)+len, jl_string_type);
+    *(size_t*)s = len;
+    memcpy((char*)s + sizeof(void*), str, len);
     return s;
 }
 
diff --git a/src/julia.h b/src/julia.h
index a6222b9a58c3d0..88c185624cc083 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -756,8 +756,8 @@ STATIC_INLINE void jl_array_uint8_set(void *a, size_t i, uint8_t x)
 #define jl_data_ptr(v) ((jl_value_t**)v)
 
 #define jl_array_ptr_data(a) ((jl_value_t**)((jl_array_t*)a)->data)
-#define jl_string_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)
-#define jl_string_len(s) (jl_array_len((jl_array_t*)(jl_data_ptr(s)[0])))
+#define jl_string_data(s) ((char*)(s) + sizeof(void*))
+#define jl_string_len(s) (*(size_t*)(s))
 #define jl_iostr_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)
 
 #define jl_gf_mtable(f) (((jl_datatype_t*)jl_typeof(f))->name->mt)