From 8a81cb35f6c16d56ee69441b0eaaf655e577f00f Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 7 Oct 2023 17:38:26 +0200 Subject: [PATCH 1/5] Implement StringPairs for efficient pairs(::AbstractString) The default `pairs` will iterate keys and values separately. For strings, this represents double work, since both these iterations will need to determine valid string indices. The introduced StringPairs type will, whenever possible, only compute valid indices once. Currently, this is only optimised for `String` and `SubString{String}`, and not for `AbstractString`, nor is it optimised when reversed. --- base/strings/util.jl | 35 +++++++++++++++++++++++++++++++++++ test/strings/util.jl | 11 +++++++++++ 2 files changed, 46 insertions(+) diff --git a/base/strings/util.jl b/base/strings/util.jl index a3f8e9a4773a6..12254dda08bf4 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -1102,3 +1102,38 @@ function Base.rest(s::AbstractString, st...) end return String(take!(io)) end + +""" + StringPairs{T}(x::AbstractString) + +This internal type is an iterator over (key => value) pairs of strings. +""" +struct StringPairs{T <: AbstractString} + x::T +end + +StringPairs(x) = StringPairs{typeof(x)}(x) +IteratorSize(::Type{StringPairs{T}}) where T = IteratorSize(T) +length(x::StringPairs) = length(x.x) +pairs(x::AbstractString) = StringPairs(x) + +# Generic fallback +function iterate(x::StringPairs, i=firstindex(x.x)) + i > ncodeunits(x.x) && return nothing + (i => x.x[i], nextind(x.x, i)) +end + +# In this method, exploit that string iteration's state is the index +function iterate( + x::StringPairs{<:Union{String, SubString{String}}}, + state::Int=firstindex(x.x) +) + (char, i) = @something iterate(x.x, state) return nothing + (state => char, i) +end + +# At this moment, Reverse{<:AbstractString} is inefficient, so this simple +# implementation is not easily optimised +function iterate(x::Iterators.Reverse{<:StringPairs}, i=lastindex(x.itr.x)) + i < firstindex(x.itr.x) ? nothing : (i => x.itr.x[i], prevind(x.itr.x, i)) +end diff --git a/test/strings/util.jl b/test/strings/util.jl index 59638dc3b9ca6..00a9b7e4fd924 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -724,3 +724,14 @@ end @test endswith(A, split(B, ' ')[end]) @test endswith(A, 'g') end + +@testset "pairs" begin + for s in ["", "a", "abcde", "γ", "∋γa"] + for T in (String, SubString, GenericString) + sT = T(s) + @test collect(pairs(sT)) == [k=>v for (k,v) in zip(keys(sT), sT)] + rv = Iterators.reverse(pairs(sT)) + @test collect(rv) == reverse([k=>v for (k,v) in zip(keys(sT), sT)]) + end + end +end From 8d2699ceee263103bbd032ff5a3378cc3867a460 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 28 Oct 2023 12:15:10 +0200 Subject: [PATCH 2/5] Generalize to IterableStatePairs --- base/iterators.jl | 36 +++++++++++++++++++++++++++++++++++- base/strings/string.jl | 2 +- base/strings/util.jl | 35 ----------------------------------- test/iterators.jl | 14 ++++++++++++++ test/strings/util.jl | 11 ----------- 5 files changed, 50 insertions(+), 48 deletions(-) diff --git a/base/iterators.jl b/base/iterators.jl index 24595d29ecc90..f3f68c4bee225 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -13,7 +13,7 @@ using .Base: SizeUnknown, HasLength, HasShape, IsInfinite, EltypeUnknown, HasEltype, OneTo, @propagate_inbounds, @isdefined, @boundscheck, @inbounds, Generator, AbstractRange, AbstractUnitRange, UnitRange, LinearIndices, TupleOrBottom, - (:), |, +, -, *, !==, !, ==, !=, <=, <, >, >=, missing, + (:), |, +, -, *, !==, !, ==, !=, <=, <, >, >=, =>, missing, any, _counttuple, eachindex, ntuple, zero, prod, reduce, in, firstindex, lastindex, tail, fieldtypes, min, max, minimum, zero, oneunit, promote, promote_shape using Core: @doc @@ -1573,4 +1573,38 @@ only(x::NamedTuple) = throw( ArgumentError("NamedTuple contains $(length(x)) elements, must contain exactly 1 element") ) +""" + IterableStatePairs(x) + +This internal type is returned by [`pairs`](@ref), when the key is the same as +the state of `iterate`. This allows the iterator to determine the key => value +pairs by only calling iterate on the values. + +""" +struct IterableStatePairs{T} + x::T +end + +IteratorSize(::Type{<:IterableStatePairs{T}}) where T = IteratorSize(T) +length(x::IterableStatePairs) = length(x.x) + +function iterate(x::IterableStatePairs, state=first(keys(x.x))) + it = iterate(x.x, state) + it === nothing && return nothing + (state => first(it), last(it)) +end + +reverse(x::IterableStatePairs) = IterableStatePairs(Iterators.reverse(x.x)) +reverse(x::IterableStatePairs{<:Iterators.Reverse}) = IterableStatePairs(x.x.itr) + +function iterate(x::IterableStatePairs{<:Iterators.Reverse}, state=last(keys(x.x.itr))) + it = iterate(x.x, state) + it === nothing && return nothing + (state => first(it), last(it)) +end + +# According to the docs of iterate(::AbstractString), the iteration state must +# be the same as the keys, so this is a valid optimization (see #51631) +pairs(s::AbstractString) = IterableStatePairs(s) + end diff --git a/base/strings/string.jl b/base/strings/string.jl index 9f5995c25c558..e121c58154d6e 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -412,7 +412,7 @@ is_valid_continuation(c) = c & 0xc0 == 0x80 b = @inbounds codeunit(s, i) u = UInt32(b) << 24 between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1 - return iterate_continued(s, i, u) + return @noinline iterate_continued(s, i, u) end # duck-type s so that external UTF-8 string packages like StringViews can hook in diff --git a/base/strings/util.jl b/base/strings/util.jl index 12254dda08bf4..a3f8e9a4773a6 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -1102,38 +1102,3 @@ function Base.rest(s::AbstractString, st...) end return String(take!(io)) end - -""" - StringPairs{T}(x::AbstractString) - -This internal type is an iterator over (key => value) pairs of strings. -""" -struct StringPairs{T <: AbstractString} - x::T -end - -StringPairs(x) = StringPairs{typeof(x)}(x) -IteratorSize(::Type{StringPairs{T}}) where T = IteratorSize(T) -length(x::StringPairs) = length(x.x) -pairs(x::AbstractString) = StringPairs(x) - -# Generic fallback -function iterate(x::StringPairs, i=firstindex(x.x)) - i > ncodeunits(x.x) && return nothing - (i => x.x[i], nextind(x.x, i)) -end - -# In this method, exploit that string iteration's state is the index -function iterate( - x::StringPairs{<:Union{String, SubString{String}}}, - state::Int=firstindex(x.x) -) - (char, i) = @something iterate(x.x, state) return nothing - (state => char, i) -end - -# At this moment, Reverse{<:AbstractString} is inefficient, so this simple -# implementation is not easily optimised -function iterate(x::Iterators.Reverse{<:StringPairs}, i=lastindex(x.itr.x)) - i < firstindex(x.itr.x) ? nothing : (i => x.itr.x[i], prevind(x.itr.x, i)) -end diff --git a/test/iterators.jl b/test/iterators.jl index 46e7c8b454335..b1c27ec2968a5 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -1009,3 +1009,17 @@ end @testset "collect partition substring" begin @test collect(Iterators.partition(lstrip("01111", '0'), 2)) == ["11", "11"] end + +@testset "IterableStringPairs" begin + for s in ["", "a", "abcde", "γ", "∋γa"] + for T in (String, SubString, GenericString) + sT = T(s) + p = pairs(sT) + @test collect(p) == [k=>v for (k,v) in zip(keys(sT), sT)] + rv = Iterators.reverse(p) + @test collect(rv) == reverse([k=>v for (k,v) in zip(keys(sT), sT)]) + rrv = Iterators.reverse(rv) + @test collect(rrv) == collect(p) + end + end +end diff --git a/test/strings/util.jl b/test/strings/util.jl index 00a9b7e4fd924..59638dc3b9ca6 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -724,14 +724,3 @@ end @test endswith(A, split(B, ' ')[end]) @test endswith(A, 'g') end - -@testset "pairs" begin - for s in ["", "a", "abcde", "γ", "∋γa"] - for T in (String, SubString, GenericString) - sT = T(s) - @test collect(pairs(sT)) == [k=>v for (k,v) in zip(keys(sT), sT)] - rv = Iterators.reverse(pairs(sT)) - @test collect(rv) == reverse([k=>v for (k,v) in zip(keys(sT), sT)]) - end - end -end From 1b273e501e580a40ecec07b930ee6df330313c4e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 25 May 2024 10:56:16 +0200 Subject: [PATCH 3/5] Refine iterator type --- base/iterators.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/base/iterators.jl b/base/iterators.jl index af175ca9d6baa..658ce4d853906 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -1586,6 +1586,7 @@ end IteratorSize(::Type{<:IterableStatePairs{T}}) where T = IteratorSize(T) length(x::IterableStatePairs) = length(x.x) +Base.eltype(::Type{Base.Iterators.IterableStatePairs{T}}) where T = Tuple{Any, eltype(T)} function iterate(x::IterableStatePairs, state=first(keys(x.x))) it = iterate(x.x, state) From 9ca5509649e91180c6b5dbba353c3f7cb0a25edf Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 25 May 2024 11:26:16 +0200 Subject: [PATCH 4/5] Style nit --- base/iterators.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/iterators.jl b/base/iterators.jl index 658ce4d853906..4ae890697e08f 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -1586,7 +1586,7 @@ end IteratorSize(::Type{<:IterableStatePairs{T}}) where T = IteratorSize(T) length(x::IterableStatePairs) = length(x.x) -Base.eltype(::Type{Base.Iterators.IterableStatePairs{T}}) where T = Tuple{Any, eltype(T)} +Base.eltype(::Type{IterableStatePairs{T}}) where T = Tuple{Any, eltype(T)} function iterate(x::IterableStatePairs, state=first(keys(x.x))) it = iterate(x.x, state) From 1d30dcc9990544b2e867daeb05fffffc4abad9bd Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 25 May 2024 11:35:42 +0200 Subject: [PATCH 5/5] Refine type --- base/iterators.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/iterators.jl b/base/iterators.jl index 4ae890697e08f..fd410c4ab17c1 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -1586,7 +1586,7 @@ end IteratorSize(::Type{<:IterableStatePairs{T}}) where T = IteratorSize(T) length(x::IterableStatePairs) = length(x.x) -Base.eltype(::Type{IterableStatePairs{T}}) where T = Tuple{Any, eltype(T)} +Base.eltype(::Type{IterableStatePairs{T}}) where T = Pair{<:Any, eltype(T)} function iterate(x::IterableStatePairs, state=first(keys(x.x))) it = iterate(x.x, state)