Skip to content

Commit

Permalink
improve string effects (#48996)
Browse files Browse the repository at this point in the history
  • Loading branch information
oscardssmith authored Mar 23, 2023
1 parent 20b8139 commit 56950e2
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 26 deletions.
2 changes: 1 addition & 1 deletion base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ end
const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed
const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81

function hash(s::String, h::UInt)
@assume_effects :total function hash(s::String, h::UInt)
h += memhash_seed
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h
end
38 changes: 22 additions & 16 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,28 +113,28 @@ pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
ncodeunits(s::String) = Core.sizeof(s)
codeunit(s::String) = UInt8

@inline function codeunit(s::String, i::Integer)
@assume_effects :foldable @inline function codeunit(s::String, i::Integer)
@boundscheck checkbounds(s, i)
b = GC.@preserve s unsafe_load(pointer(s, i))
return b
end

## comparison ##

_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len) =
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})

_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, len % Csize_t) % Int
end

function cmp(a::String, b::String)
al, bl = sizeof(a), sizeof(b)
c = _memcmp(a, b, min(al,bl))
c = _memcmp(a, b)
return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
end

function ==(a::String, b::String)
pointer_from_objref(a) == pointer_from_objref(b) && return true
al = sizeof(a)
return al == sizeof(b) && 0 == _memcmp(a, b, al)
end
==(a::String, b::String) = a===b

typemin(::Type{String}) = ""
typemin(::String) = typemin(String)
Expand Down Expand Up @@ -284,23 +284,25 @@ getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(
return ss
end

length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
# nothrow because we know the start and end indices are valid
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))

@inline function length(s::String, i::Int, j::Int)
# effects needed because @inbounds
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
@boundscheck begin
0 < i ncodeunits(s)+1 || throw(BoundsError(s, i))
0 j < ncodeunits(s)+1 || throw(BoundsError(s, j))
end
j < i && return 0
@inbounds i, k = thisind(s, i), i
c = j - i + (i == k)
length_continued(s, i, j, c)
@inbounds length_continued(s, i, j, c)
end

@inline function length_continued(s::String, i::Int, n::Int, c::Int)
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
i < n || return c
@inbounds b = codeunit(s, i)
@inbounds while true
b = codeunit(s, i)
while true
while true
(i += 1) n || return c
0xc0 b 0xf7 && break
Expand Down Expand Up @@ -328,6 +330,10 @@ isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i

isascii(s::String) = isascii(codeunits(s))

# don't assume effects for general integers since we cannot know their implementation
@assume_effects :foldable function repeat(c::Char, r::BitInteger)
@invoke repeat(c, r::Integer)
end
"""
repeat(c::AbstractChar, r::Integer) -> String
Expand All @@ -340,8 +346,8 @@ julia> repeat('A', 3)
"AAA"
```
"""
repeat(c::AbstractChar, r::Integer) = repeat(Char(c), r) # fallback
function repeat(c::Char, r::Integer)
function repeat(c::AbstractChar, r::Integer)
c = Char(c)
r == 0 && return ""
r < 0 && throw(ArgumentError("can't repeat a character $r times"))
u = bswap(reinterpret(UInt32, c))
Expand Down
34 changes: 26 additions & 8 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,12 @@ thisind(s::SubString{String}, i::Int) = _thisind_str(s, i)
nextind(s::SubString{String}, i::Int) = _nextind_str(s, i)

function ==(a::Union{String, SubString{String}}, b::Union{String, SubString{String}})
s = sizeof(a)
s == sizeof(b) && 0 == _memcmp(a, b, s)
sizeof(a) == sizeof(b) && _memcmp(a, b) == 0
end

function cmp(a::SubString{String}, b::SubString{String})
na = sizeof(a)
nb = sizeof(b)
c = _memcmp(a, b, min(na, nb))
return c < 0 ? -1 : c > 0 ? +1 : cmp(na, nb)
c = _memcmp(a, b)
return c < 0 ? -1 : c > 0 ? +1 : cmp(sizeof(a), sizeof(b))
end

# don't make unnecessary copies when passing substrings to C functions
Expand Down Expand Up @@ -209,19 +206,34 @@ end
return n
end

@inline function __unsafe_string!(out, s::Union{String, SubString{String}}, offs::Integer)
@assume_effects :nothrow @inline function __unsafe_string!(out, s::String, offs::Integer)
n = sizeof(s)
GC.@preserve s out unsafe_copyto!(pointer(out, offs), pointer(s), n)
return n
end

@inline function __unsafe_string!(out, s::Symbol, offs::Integer)
@inline function __unsafe_string!(out, s::SubString{String}, offs::Integer)
n = sizeof(s)
GC.@preserve s out unsafe_copyto!(pointer(out, offs), pointer(s), n)
return n
end

@assume_effects :nothrow @inline function __unsafe_string!(out, s::Symbol, offs::Integer)
n = sizeof(s)
GC.@preserve s out unsafe_copyto!(pointer(out, offs), unsafe_convert(Ptr{UInt8},s), n)
return n
end

# nothrow needed here because for v in a can't prove the indexing is inbounds.
@assume_effects :foldable :nothrow function string(a::Union{Char, String, Symbol}...)
_string(a...)
end

function string(a::Union{Char, String, SubString{String}, Symbol}...)
_string(a...)
end

function _string(a::Union{Char, String, SubString{String}, Symbol}...)
n = 0
for v in a
# 4 types is too many for automatic Union-splitting, so we split manually
Expand Down Expand Up @@ -250,6 +262,12 @@ function string(a::Union{Char, String, SubString{String}, Symbol}...)
return out
end

# don't assume effects for general integers since we cannot know their implementation
# not nothrow because r<0 throws
@assume_effects :foldable function repeat(s::String, r::BitInteger)
@invoke repeat(s, r::Integer)
end

function repeat(s::Union{String, SubString{String}}, r::Integer)
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
r == 0 && return ""
Expand Down
3 changes: 2 additions & 1 deletion test/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1384,7 +1384,8 @@ end

# sanity check `@allocations` returns what we expect in some very simple cases
@test (@allocations "a") == 0
@test (@allocations "a" * "b") == 1
@test (@allocations "a" * "b") == 0 # constant propagation
@test (@allocations "a" * Base.inferencebarrier("b")) == 1
end

@testset "in_finalizer" begin
Expand Down
39 changes: 39 additions & 0 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1192,3 +1192,42 @@ end
return a
end |> Core.Compiler.is_foldable
end

@testset "String Effects" begin
for (f, Ts) in [(*, (String, String)),
(*, (Char, String)),
(*, (Char, Char)),
(string, (Symbol, String, Char)),
(==, (String, String)),
(cmp, (String, String)),
(==, (Symbol, Symbol)),
(cmp, (Symbol, Symbol)),
(String, (Symbol,)),
(length, (String,)),
(hash, (String,UInt)),
(hash, (Char,UInt)),]
e = Base.infer_effects(f, Ts)
@test Core.Compiler.is_foldable(e) || (f, Ts)
@test Core.Compiler.is_removable_if_unused(e) || (f, Ts)
end
for (f, Ts) in [(^, (String, Int)),
(^, (Char, Int)),
(codeunit, (String, Int)),
]
e = Base.infer_effects(f, Ts)
@test Core.Compiler.is_foldable(e) || (f, Ts)
@test !Core.Compiler.is_removable_if_unused(e) || (f, Ts)
end
# Substrings don't have any nice effects because the compiler can
# invent fake indices leading to out of bounds
for (f, Ts) in [(^, (SubString{String}, Int)),
(string, (String, SubString{String})),
(string, (Symbol, SubString{String})),
(hash, (SubString{String},UInt)),
]
e = Base.infer_effects(f, Ts)
@test !Core.Compiler.is_foldable(e) || (f, Ts)
@test !Core.Compiler.is_removable_if_unused(e) || (f, Ts)
end
@test_throws ArgumentError Symbol("a\0a")
end

3 comments on commit 56950e2

@vtjnash
Copy link
Member

@vtjnash vtjnash commented on 56950e2 Apr 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier runbenchmarks("parse_json", vs="@20b8139ecbc5ae27a005675fcfd6dfd47b34a27c")

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

@StefanKarpinski
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do better string effects make things slower and take more memory?

Please sign in to comment.