Skip to content

Commit

Permalink
possible simple approach to faster String allocation
Browse files Browse the repository at this point in the history
[ci skip]
  • Loading branch information
JeffBezanson committed Nov 29, 2016
1 parent 2a4b68a commit 9b881ff
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 51 deletions.
8 changes: 4 additions & 4 deletions base/boot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,10 @@ end

abstract DirectIndexString <: AbstractString

# NOTE(review): the +/- markers of this hunk were lost in extraction. Going by
# the "4 additions & 4 deletions" count for base/boot.jl, the next four lines
# are the pre-commit definition (removed): an immutable wrapper whose single
# field `data` is a byte array.
immutable String <: AbstractString
data::Array{UInt8,1}
# required to make String("foo") work (#15120):
String(d::Array{UInt8,1}) = new(d)
# Post-commit definition (added): the only declared field is `len::Int`.
# The inner constructor takes a byte pointer and a length and delegates to the
# C function `jl_pchar_to_string`, declaring the result as `Ref{String}`.
type String <: AbstractString
len::Int
String(p::Ptr{UInt8}, len::Int) =
ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end

# This should always be inlined
Expand Down
4 changes: 2 additions & 2 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end

# Pre-commit lines (removed): conversions of any AbstractString to a byte
# array, implemented by reading the `data` field of `String(s)`.
convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
# Post-commit lines (added): the same two methods, kept only as commented-out
# code; they still refer to the `data` field.
#convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
#convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
# Conversions from an arbitrary AbstractString to other representations.

# To String: build it with the String constructor.
function convert(::Type{String}, s::AbstractString)
    return String(s)
end

# To Vector{Char}: gather the characters produced by iterating the string.
function convert(::Type{Vector{Char}}, s::AbstractString)
    return collect(s)
end

# To Symbol: build it with the Symbol constructor.
function convert(::Type{Symbol}, s::AbstractString)
    return Symbol(s)
end
Expand Down
58 changes: 25 additions & 33 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,46 +34,35 @@ const utf8_trailing = [
## required core functionality ##

# Scan backwards from the last byte of `s`, stepping over every byte for which
# `is_valid_continuation` holds, and return the index where the scan stops
# (0 if it runs off the front).
# NOTE(review): this hunk lost its +/- markers; both bodies appear below and
# share the trailing `i -= 1` / `end` / `i` lines.
function endof(s::String)
# Pre-commit lines (removed): scan the `data` byte array under @inbounds.
d = s.data
i = length(d)
@inbounds while i > 0 && is_valid_continuation(d[i])
# Post-commit lines (added): same scan using raw loads from `pointer(s)`,
# starting at byte index `s.len`. The `i > 0` test short-circuits before each
# `unsafe_load`, so no load happens at index 0.
p = pointer(s)
i = s.len
while i > 0 && is_valid_continuation(unsafe_load(p,i))
i -= 1
end
i
end

# Return the number of bytes in `s` for which `is_valid_continuation` is false.
# NOTE(review): this hunk lost its +/- markers; removed and added lines are
# interleaved and share `cnum = 0`, the loop's `end`, and the final `cnum`.
function length(s::String)
# Pre-commit line (removed): take the byte array.
d = s.data
# Post-commit line (added): take a raw pointer to the bytes instead.
p = pointer(s)
cnum = 0
# Pre-commit loop header and body (removed): index `d` under @inbounds.
for i = 1:length(d)
@inbounds cnum += !is_valid_continuation(d[i])
# Post-commit loop header and body (added): `i` ranges over 1:s.len and each
# byte is read with `unsafe_load(p,i)`.
for i = 1:s.len
cnum += !is_valid_continuation(unsafe_load(p,i))
end
cnum
end

@noinline function slow_utf8_next(d::Vector{UInt8}, b::UInt8, i::Int)
# potentially faster version
# d = s.data
# a::UInt32 = d[i]
# if a < 0x80; return Char(a); end
# #if a&0xc0==0x80; return '\ufffd'; end
# b::UInt32 = a<<6 + d[i+1]
# if a < 0xe0; return Char(b - 0x00003080); end
# c::UInt32 = b<<6 + d[i+2]
# if a < 0xf0; return Char(c - 0x000e2080); end
# return Char(c<<6 + d[i+3] - 0x03c82080)

@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
if is_valid_continuation(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
end
trailing = utf8_trailing[b + 1]
if length(d) < i + trailing
if l < i + trailing
return '\ufffd', i+1
end
c::UInt32 = 0
for j = 1:(trailing + 1)
c <<= 6
c += d[i]
c += unsafe_load(p,i)
i += 1
end
c -= utf8_offset[trailing + 1]
Expand All @@ -84,12 +73,15 @@ end
# function is split into this critical fast-path
# for pure ascii data, such as parsing numbers,
# and a longer function that can handle any utf8 data
d = s.data
b = d[i]
if i < 1 || i > s.len
throw(BoundsError(s,i))
end
p = pointer(s)
b = unsafe_load(p, i)
if b < 0x80
return Char(b), i + 1
return Char(b), i+1
end
return slow_utf8_next(d, b, i)
return slow_utf8_next(p, b, i, s.len)
end

function first_utf8_byte(ch::Char)
Expand All @@ -102,20 +94,20 @@ function first_utf8_byte(ch::Char)
end

# Map index `i` to the mirrored byte position (total bytes + 1 - i), then step
# backwards while the byte there satisfies `is_valid_continuation`, and return
# the resulting index.
# NOTE(review): this hunk lost its +/- markers; both versions appear below and
# share the `j -= 1` / `end` / `return j` lines.
function reverseind(s::String, i::Integer)
# Pre-commit lines (removed): index the `data` array (no @inbounds here).
j = length(s.data) + 1 - i
d = s.data
while is_valid_continuation(d[j])
# Post-commit lines (added): same computation with `s.len` and raw loads.
# NOTE(review): nothing visible here checks `j` against 1:s.len before
# `unsafe_load(p,j)` -- confirm callers guarantee a valid `i`.
j = s.len + 1 - i
p = pointer(s)
while is_valid_continuation(unsafe_load(p,j))
j -= 1
end
return j
end

## overload methods for efficiency ##

# NOTE(review): +/- markers were lost. In each pair below the first line is the
# pre-commit version (removed, uses the `data` field) and the second is the
# post-commit version (added, uses the `len` field).

# Size in bytes: previously the size of the byte array, now the `len` field.
sizeof(s::String) = sizeof(s.data)
sizeof(s::String) = s.len

# True when `i` lies within the byte range of `s` and the byte at `i` does not
# satisfy `is_valid_continuation`. The first body line is pre-commit, the
# second post-commit; in the new one the range test short-circuits before the
# `unsafe_load`.
isvalid(s::String, i::Integer) =
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
(1 <= i <= s.len) && !is_valid_continuation(unsafe_load(pointer(s),i))

# Unchanged context line.
# NOTE(review): this calls String on a Vector{UInt8}; the only constructor
# visible in the new type definition takes (Ptr{UInt8}, Int) -- confirm a
# Vector{UInt8} method exists elsewhere.
const empty_utf8 = String(UInt8[])

Expand Down Expand Up @@ -237,10 +229,10 @@ function reverse(s::String)
String(buf)
end

# NOTE(review): +/- markers were lost; removed (pre-commit) and added
# (post-commit) lines are labelled individually below.

# Pre-commit (removed): write the byte array.
write(io::IO, s::String) = write(io, s.data)
# Post-commit (added): write `s.len` bytes starting at `pointer(s)`.
write(io::IO, s::String) = write(io, pointer(s), s.len)

# Pre-commit (removed): pointers into the `data` array.
pointer(x::String) = pointer(x.data)
pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
# Post-commit (added): the byte pointer is the object's own address plus
# sizeof(Int), converted to Ptr{UInt8}; the two-argument form offsets that
# pointer by i-1 bytes.
# NOTE(review): the result is a raw pointer derived from the object address;
# presumably callers must keep `s` reachable while using it -- confirm.
pointer(s::String) = convert(Ptr{UInt8}, pointer_from_objref(s)+sizeof(Int))
pointer(x::String, i::Integer) = pointer(x)+(i-1)

# Converting a String to String hands back the very same object.
function convert(::Type{String}, s::String)
    return s
end

# Converting a byte vector to String goes through the String constructor.
function convert(::Type{String}, v::Vector{UInt8})
    return String(v)
end
14 changes: 4 additions & 10 deletions src/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -369,20 +369,14 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len)

// Build a string object from the array `a`.
// NOTE(review): the +/- markers of this hunk were lost. The src/array.c header
// reports 4 additions and 10 deletions, which implies that all six statements
// lines before the final `return jl_pchar_to_string(...)` are removed
// pre-commit code, including the element-type check -- confirm against the
// real diff.
JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a)
{
// Pre-commit body (removed): raise a type error unless `a` has type
// jl_array_uint8_type, allocate a one-pointer object of jl_string_type, and
// store `a` itself as field 0.
jl_ptls_t ptls = jl_get_ptls_states();
if (!jl_typeis(a, jl_array_uint8_type))
jl_type_error("jl_array_to_string", (jl_value_t*)jl_array_uint8_type, (jl_value_t*)a);
jl_value_t *s = jl_gc_alloc(ptls, sizeof(void*), jl_string_type);
jl_set_nth_field(s, 0, (jl_value_t*)a);
return s;
// Post-commit body (added): pass the array's data pointer and length to
// jl_pchar_to_string and return its result.
return jl_pchar_to_string(jl_array_data(a), jl_array_len(a));
}

// Build a string object from `len` bytes starting at `str`.
// NOTE(review): the +/- markers of this hunk were lost; both bodies appear
// below and share the final `return s;`.
JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len)
{
// Pre-commit body (removed): make an array from the bytes via
// jl_pchar_to_array, keep it in a GC frame, and wrap it with
// jl_array_to_string.
jl_array_t *a = jl_pchar_to_array(str, len);
JL_GC_PUSH1(&a);
jl_value_t *s = jl_array_to_string(a);
JL_GC_POP();
// Post-commit body (added): allocate sizeof(void*)+len bytes of
// jl_string_type, store `len` as a size_t at offset 0, and copy the `len`
// bytes to offset sizeof(void*).
// NOTE(review): no terminating NUL byte is written here and none is included
// in the allocation size -- confirm whether users of the data pointer expect
// one.
jl_value_t *s = jl_gc_allocobj(jl_get_ptls_states(), sizeof(void*)+len, jl_string_type);
*(size_t*)s = len;
memcpy((char*)s + sizeof(void*), str, len);
return s;
}

Expand Down
4 changes: 2 additions & 2 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -756,8 +756,8 @@ STATIC_INLINE void jl_array_uint8_set(void *a, size_t i, uint8_t x)
#define jl_data_ptr(v) ((jl_value_t**)v)

#define jl_array_ptr_data(a) ((jl_value_t**)((jl_array_t*)a)->data)
#define jl_string_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)
#define jl_string_len(s) (jl_array_len((jl_array_t*)(jl_data_ptr(s)[0])))
// Accessors for the inline string layout: a size_t length word at offset 0 of
// the object, with the string bytes starting sizeof(void*) bytes in.
// The argument is parenthesized so that an expression argument (e.g. `p + 1`
// or `c ? a : b`) is cast / dereferenced as a whole rather than having the
// cast bind only to its first operand.
#define jl_string_data(s) ((char*)(s) + sizeof(void*))
#define jl_string_len(s) (*(size_t*)(s))
#define jl_iostr_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)

#define jl_gf_mtable(f) (((jl_datatype_t*)jl_typeof(f))->name->mt)
Expand Down

0 comments on commit 9b881ff

Please sign in to comment.