From d78c6885119e20ecf34f54f1c841e4b51d90dac1 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 08:20:11 +0900 Subject: [PATCH 01/30] add findnext --- base/strings/search.jl | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/base/strings/search.jl b/base/strings/search.jl index 51b23705671f7..4365d3ec36f39 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -301,6 +301,17 @@ julia> findnext("Lang", "JuliaLang", 2) """ findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, Int(start)) +function findnext(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, start::Integer) + i = Int(start) + i > lastindex(b) + 1 && return nothing + i < firstindex(b) && throw(BoundsError(b, i)) + if i ≠ thisind(b, i) + i = nextind(b, i) + end + offset = search_forward(codeunits(a), codeunits(b), i - 1) + return offset ≥ 0 ? (offset+1:offset+lastindex(a)) : nothing +end + """ findnext(ch::AbstractChar, string::AbstractString, start::Integer) @@ -635,3 +646,56 @@ The returned function is of type `Base.Fix2{typeof(occursin)}`. 
occursin(haystack) = Base.Fix2(occursin, haystack) in(::AbstractString, ::AbstractString) = error("use occursin(x, y) for string containment") + +function search_forward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) where T <: Union{Int8,UInt8} + m = length(a) + n = length(b) - k + if m > n + return -1 + elseif m == 0 + return k + end + + # preprocess + a_end = a[end] + filter = bloom_filter_bit(a_end) + displacement = m + @inbounds for i in firstindex(a):lastindex(a)-1 + filter |= bloom_filter_bit(a[i]) + if a[i] == a_end + displacement = lastindex(a) - i + end + end + + # main loop + last = lastindex(b) + p = firstindex(b) + k + @inbounds while p + m - 1 ≤ last + if a_end == b[p+m-1] + # the last byte is matching + i = firstindex(a) + while i < lastindex(a) + a[i] == b[p+i-1] || break + i += 1 + end + if i == lastindex(a) + return p - firstindex(b) + elseif p + m ≤ last && !mayhave(filter, b[p+m]) + p += m + 1 + else + p += displacement + end + else + if p + m ≤ last && !mayhave(filter, b[p+m]) + p += m + 1 + else + p += 1 + end + end + end + return -1 +end + +# Bloom filter using a 64-bit integer +bloom_filter_bit(x::Union{Int8,UInt8}) = UInt64(1) << (x & 63) +mayhave(filter::UInt64, x::Union{Int8,UInt8}) = filter & bloom_filter_bit(x) ≠ 0 \ No newline at end of file From 68e8cdf058c1137656fb3fd9b7c2c4e0de4fd106 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 08:29:59 +0900 Subject: [PATCH 02/30] findprev --- base/strings/search.jl | 58 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/base/strings/search.jl b/base/strings/search.jl index 4365d3ec36f39..a0c3ddfb72f4d 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -566,6 +566,15 @@ julia> findprev("Julia", "JuliaLang", 6) """ findprev(t::AbstractString, s::AbstractString, i::Integer) = _rsearch(s, t, Int(i)) +function findprev(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, stop::Integer) + i = Int(stop) + i 
< firstindex(b) - 1 && return nothing + n = ncodeunits(b) + i > n && throw(BoundsError(b, i)) + offset = search_backward(codeunits(a), codeunits(b), n + 1 - nextind(b, i)) + return offset ≥ 0 ? (offset+1:offset+lastindex(a)) : nothing +end + """ findprev(ch::AbstractChar, string::AbstractString, start::Integer) @@ -696,6 +705,55 @@ function search_forward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) wher return -1 end +function search_backward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) where T <: Union{Int8,UInt8} + m = length(a) + n = length(b) - k + if m > n + return -1 + elseif m == 0 + return k + end + + # preprocess + a_begin = a[begin] + filter = bloom_filter_bit(a_begin) + displacement = m + @inbounds for i in lastindex(a):-1:firstindex(a)+1 + filter |= bloom_filter_bit(a[i]) + if a[i] == a_begin + displacement = firstindex(a) - i + end + end + + # main loop + first = firstindex(b) + p = lastindex(b) + 1 - (m + k) + @inbounds while p ≥ first + if a_begin == b[p] + # the first byte is matching + i = lastindex(a) + while i > firstindex(a) + a[i] == b[p+i-1] || break + i -= 1 + end + if i == firstindex(a) + return p - first + elseif p - 1 ≥ first && !mayhave(filter, b[p-1]) + p -= m + 1 + else + p -= displacement + end + else + if p - 1 ≥ first && !mayhave(filter, b[p-1]) + p -= m + 1 + else + p -= 1 + end + end + end + return -1 +end + # Bloom filter using a 64-bit integer bloom_filter_bit(x::Union{Int8,UInt8}) = UInt64(1) << (x & 63) mayhave(filter::UInt64, x::Union{Int8,UInt8}) = filter & bloom_filter_bit(x) ≠ 0 \ No newline at end of file From d9ad02e458d47ab834118e3a17453ee6f28a491d Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 08:30:56 +0900 Subject: [PATCH 03/30] fix --- base/strings/search.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index a0c3ddfb72f4d..e8c74361d40f5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -303,7 +303,7 
@@ findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, I function findnext(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, start::Integer) i = Int(start) - i > lastindex(b) + 1 && return nothing + i > ncodeunits(b) + 1 && return nothing i < firstindex(b) && throw(BoundsError(b, i)) if i ≠ thisind(b, i) i = nextind(b, i) From 451392a49e2d230d034b28bc811e7ec7622df6e9 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 08:54:08 +0900 Subject: [PATCH 04/30] fix --- base/strings/search.jl | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index e8c74361d40f5..a9a1e4c5269eb 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -303,8 +303,8 @@ findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, I function findnext(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, start::Integer) i = Int(start) - i > ncodeunits(b) + 1 && return nothing i < firstindex(b) && throw(BoundsError(b, i)) + i > ncodeunits(b) + 1 && return nothing if i ≠ thisind(b, i) i = nextind(b, i) end @@ -353,8 +353,16 @@ julia> findnext([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3) """ findnext(pattern::AbstractVector{<:Union{Int8,UInt8}}, A::AbstractVector{<:Union{Int8,UInt8}}, - start::Integer) = - _search(A, pattern, start) + start::Integer) = _search(A, pattern, start) + +function findnext(a::AbstractVector{T}, b::AbstractVector{T}, start::Integer) where T <: Union{Int8,UInt8} + i = Int(start) + first = firstindex(b) + i < first && throw(BoundsError(b, i)) + i > lastindex(b) + 1 && return nothing + offset = search_forward(a, b, i - 1) + return offset ≥ 0 ? 
(offset+first:offset+first+lastindex(a)-1) : nothing +end """ findlast(pattern::AbstractString, string::AbstractString) @@ -613,8 +621,18 @@ julia> findprev([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3) """ findprev(pattern::AbstractVector{<:Union{Int8,UInt8}}, A::AbstractVector{<:Union{Int8,UInt8}}, - start::Integer) = - _rsearch(A, pattern, start) + start::Integer) = _rsearch(A, pattern, start) + +function findprev(a::AbstractVector{T}, b::AbstractVector{T}, stop::Integer) where T <: Union{Int8,UInt8} + i = Int(stop) + first = firstindex(b) + i < first - 1 && return nothing + i > lastindex(b) && throw(BoundsError(b, i)) + offset = search_backward(a, b, lastindex(b) - i) + return offset ≥ 0 ? (offset+first:offset+first+lastindex(a)-1) : nothing +end + + """ occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString) @@ -711,7 +729,7 @@ function search_backward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) whe if m > n return -1 elseif m == 0 - return k + return n end # preprocess From 18cf8c3ec7e61e5f1140a2962080efa84c7021c4 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 09:07:32 +0900 Subject: [PATCH 05/30] fix --- base/strings/search.jl | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index a9a1e4c5269eb..16d853fe3c530 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -351,11 +351,7 @@ julia> findnext([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3) 4:5 ``` """ -findnext(pattern::AbstractVector{<:Union{Int8,UInt8}}, - A::AbstractVector{<:Union{Int8,UInt8}}, - start::Integer) = _search(A, pattern, start) - -function findnext(a::AbstractVector{T}, b::AbstractVector{T}, start::Integer) where T <: Union{Int8,UInt8} +function findnext(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, start::Integer) i = Int(start) first = firstindex(b) i < first && throw(BoundsError(b, i)) @@ 
-619,11 +615,7 @@ julia> findprev([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3) 2:3 ``` """ -findprev(pattern::AbstractVector{<:Union{Int8,UInt8}}, - A::AbstractVector{<:Union{Int8,UInt8}}, - start::Integer) = _rsearch(A, pattern, start) - -function findprev(a::AbstractVector{T}, b::AbstractVector{T}, stop::Integer) where T <: Union{Int8,UInt8} +function findprev(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, stop::Integer) i = Int(stop) first = firstindex(b) i < first - 1 && return nothing @@ -674,7 +666,7 @@ occursin(haystack) = Base.Fix2(occursin, haystack) in(::AbstractString, ::AbstractString) = error("use occursin(x, y) for string containment") -function search_forward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) where T <: Union{Int8,UInt8} +function search_forward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) m = length(a) n = length(b) - k if m > n @@ -723,7 +715,7 @@ function search_forward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) wher return -1 end -function search_backward(a::AbstractVector{T}, b::AbstractVector{T}, k::Int) where T <: Union{Int8,UInt8} +function search_backward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) m = length(a) n = length(b) - k if m > n From 04aea8570c76cf73dafc494f7d18ae5d46aa6e8e Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 10:01:18 +0900 Subject: [PATCH 06/30] char --- base/strings/search.jl | 83 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 16d853fe3c530..85d79c63adda0 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -1,7 +1,9 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -nothing_sentinel(i) = i == 0 ? 
nothing : i +const Fix2Eq{T} = Fix2{<:Union{typeof(isequal),typeof(==)},T} +nothing_sentinel(i) = i == 0 ? nothing : i +#= function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, s::String, i::Integer) if i < 1 || i > sizeof(s) @@ -18,7 +20,7 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} i = nextind(s, i) end end - +=# findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = nothing_sentinel(_search(a, pred.x)) @@ -329,8 +331,27 @@ julia> findnext('o', "Hello to the world", 6) 8 ``` """ -findnext(ch::AbstractChar, string::AbstractString, start::Integer) = - findnext(==(ch), string, start) +findnext(ch::AbstractChar, string::AbstractString, start::Integer) = findnext(==(ch), string, start) + +function findnext(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, start::Integer) + i = Int(start) + first = firstindex(b) + i < first && throw(BoundsError(b, i)) + i > ncodeunits(b) && return nothing + if i ≠ thisind(b, i) + i = nextind(b, i) + end + probe = first_utf8_byte(p.x) + while true + offset = search_forward(probe, codeunits(b), i - 1) + if offset < 0 + return nothing + elseif p(b[offset+first]) + return offset + first + end + i = nextind(b, i) + end +end """ findnext(pattern::AbstractVector{<:Union{Int8,UInt8}}, @@ -596,8 +617,26 @@ julia> findprev('o', "Hello to the world", 18) 15 ``` """ -findprev(ch::AbstractChar, string::AbstractString, start::Integer) = - findprev(==(ch), string, start) +findprev(ch::AbstractChar, string::AbstractString, start::Integer) = findprev(==(ch), string, start) + +function findprev(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, stop::Integer) + i = Int(stop) + first = firstindex(b) + i < first && return nothing + n = ncodeunits(b) + i > n && throw(BoundsError(b, i)) + i = thisind(b, i) + probe = first_utf8_byte(p.x) + while true + offset = search_backward(probe, codeunits(b), n - i) + if offset < 0 + 
return nothing + elseif p(b[offset+first]) + return offset + first + end + i = prevind(b, i) + end +end """ findprev(pattern::AbstractVector{<:Union{Int8,UInt8}}, @@ -666,6 +705,38 @@ occursin(haystack) = Base.Fix2(occursin, haystack) in(::AbstractString, ::AbstractString) = error("use occursin(x, y) for string containment") +function search_forward(a::Union{Int8,UInt8}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) + typemin(eltype(b)) ≤ a ≤ typemax(eltype(b)) || return -1 + @inbounds for i in firstindex(b)+k:lastindex(b) + a == b[i] && return i - firstindex(b) + end + return -1 +end + +function search_forward(a::Union{Int8,UInt8}, b::Union{Vector{<:Union{Int8,UInt8}},CodeUnits{<:Union{Int8,UInt8},<:Union{String,SubString{String}}}}, k::Int) + typemin(eltype(b)) ≤ a ≤ typemax(eltype(b)) || return -1 + n = length(b) + n ≤ k && return -1 + p = GC.@preserve b ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Cint, Csize_t), pointer(b) + k, a, n - k) + return p ≠ C_NULL ? Int(p - pointer(b)) : -1 +end + +function search_backward(a::Union{Int8,UInt8}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) + typemin(eltype(b)) ≤ a ≤ typemax(eltype(b)) || return -1 + @inbounds for i in lastindex(b)-k:-1:firstindex(b) + a == b[i] && return i - firstindex(b) + end + return -1 +end + +function search_backward(a::Union{Int8,UInt8}, b::Union{Vector{<:Union{Int8,UInt8}},CodeUnits{<:Union{Int8,UInt8},<:Union{String,SubString{String}}}}, k::Int) + typemin(eltype(b)) ≤ a ≤ typemax(eltype(b)) || return -1 + n = length(b) + n ≤ k && return -1 + p = GC.@preserve b ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Cint, Csize_t), pointer(b), a, n - k) + return p ≠ C_NULL ? 
Int(p - pointer(b)) : -1 +end + function search_forward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) m = length(a) n = length(b) - k From f9f5065e197a89dc0306bcbeffe9e8251b631eec Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 10:09:40 +0900 Subject: [PATCH 07/30] fix --- base/strings/search.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 85d79c63adda0..0da60db1c93a8 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -47,7 +47,7 @@ function _search(a::ByteArray, b::AbstractChar, i::Integer = 1) _search(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end - +#= function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, s::String, i::Integer) c = pred.x @@ -60,7 +60,7 @@ function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} i = prevind(s, i) end end - +=# findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = nothing_sentinel(_rsearch(a, pred.x)) From 348611b94b45093b218e74710b277007a0354cc2 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 10:31:54 +0900 Subject: [PATCH 08/30] fix --- base/strings/search.jl | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 0da60db1c93a8..ed163150e27cb 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -20,12 +20,13 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} i = nextind(s, i) end end -=# + findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = nothing_sentinel(_search(a, pred.x)) findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = nothing_sentinel(_search(a, pred.x, i)) +=# function _search(a::Union{String,ByteArray}, 
b::Union{Int8,UInt8}, i::Integer = 1) if i < 1 @@ -60,12 +61,12 @@ function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} i = prevind(s, i) end end -=# findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = nothing_sentinel(_rsearch(a, pred.x)) findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, pred.x, i)) +=# function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a)) if i < 1 @@ -381,6 +382,17 @@ function findnext(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Un return offset ≥ 0 ? (offset+first:offset+first+lastindex(a)-1) : nothing end +function findnext(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, start::Integer) + i = Int(start) + first = firstindex(b) + i < first && throw(BoundsError(b, i)) + i > lastindex(b) && return nothing + offset = search_forward(p.x, b, i - 1) + return offset ≥ 0 ? offset + first : nothing +end + +findfirst(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findnext(p, b, firstindex(b)) + """ findlast(pattern::AbstractString, string::AbstractString) @@ -663,6 +675,16 @@ function findprev(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Un return offset ≥ 0 ? (offset+first:offset+first+lastindex(a)-1) : nothing end +function findprev(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, stop::Integer) + i = Int(stop) + first = firstindex(b) + i < first && return nothing + i > lastindex(b) && throw(BoundsError(b, i)) + offset = search_backward(p.x, b, length(b) - i) + return offset ≥ 0 ? 
offset + first : nothing +end + +findlast(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findprev(p, b, lastindex(b)) """ occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString) From 6e2911ac9d1028a32c8ecea605c58577e49e3a6b Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 10:41:49 +0900 Subject: [PATCH 09/30] generic methods --- base/strings/search.jl | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index ed163150e27cb..f432af4a748e7 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -146,20 +146,21 @@ findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}}, _search(A, pattern, firstindex(A)) # AbstractString implementation of the generic findnext interface -function findnext(testf::Function, s::AbstractString, i::Integer) - i = Int(i) - z = ncodeunits(s) + 1 - 1 ≤ i ≤ z || throw(BoundsError(s, i)) - @inbounds i == z || isvalid(s, i) || string_index_err(s, i) - e = lastindex(s) - while i <= e - testf(@inbounds s[i]) && return i - i = @inbounds nextind(s, i) +function findnext(p::Function, b::AbstractString, start::Integer) + i = Int(start) + i < firstindex(b) && throw(BoundsError(b, i)) + last = lastindex(b) + i > last && return nothing + if i ≠ thisind(b, i) + i = nextind(b, i) + end + @inbounds while i ≤ last + p(b[i]) && return i + i = nextind(b, i) end return nothing end - in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing) function _searchindex(s::Union{AbstractString,ByteArray}, @@ -448,15 +449,15 @@ true findlast(ch::AbstractChar, string::AbstractString) = findlast(==(ch), string) # AbstractString implementation of the generic findprev interface -function findprev(testf::Function, s::AbstractString, i::Integer) - i = Int(i) - z = ncodeunits(s) + 1 - 0 ≤ i ≤ z || throw(BoundsError(s, i)) - i == z && return nothing - @inbounds i == 0 
|| isvalid(s, i) || string_index_err(s, i) - while i >= 1 - testf(@inbounds s[i]) && return i - i = @inbounds prevind(s, i) +function findprev(p::Function, b::AbstractString, stop::Integer) + i = Int(stop) + first = firstindex(b) + i < first && return nothing + i > lastindex(b) && throw(BoundsError(b, i)) + i = thisind(b, i) + @inbounds while i ≥ first + p(b[i]) && return i + i = prevind(s, i) end return nothing end From 9772daa84145176757033c5fa6249d2cfa36b27e Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:04:34 +0900 Subject: [PATCH 10/30] generic string search --- base/strings/search.jl | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index f432af4a748e7..b0476dc760be5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -303,7 +303,25 @@ julia> findnext("Lang", "JuliaLang", 2) 6:9 ``` """ -findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, Int(start)) +function findnext(a::AbstractString, b::AbstractString, start::Integer) + i = Int(start) + i < firstindex(b) && throw(BoundsError(b, i)) + i > ncodeunits(b) + 1 && return nothing + if i ≠ thisind(b, i) + i = nextind(b, i) + end + isempty(a) && return i:i-1 + a1 = first(a) + last = lastindex(b) + while i ≤ last + i = findnext(a1, b, i) + i === nothing && break + # TODO: If a and b are the same type, this is fine. But if not? 
+ startswith(SubString(b, i, last), a) && return i:i+lastindex(a)-1 + i = nextind(b, i) + end + return nothing +end function findnext(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, start::Integer) i = Int(start) @@ -602,7 +620,22 @@ julia> findprev("Julia", "JuliaLang", 6) 1:5 ``` """ -findprev(t::AbstractString, s::AbstractString, i::Integer) = _rsearch(s, t, Int(i)) +function findprev(a::AbstractString, b::AbstractString, stop::Integer) + i = Int(stop) + first = firstindex(b) + i < first - 1 && return nothing + n = ncodeunits(b) + i > n && throw(BoundsError(b, i)) + isempty(a) && return i-1:i + while i ≥ first + i = findprev(isequal(a), b, i) + i === nothing && break + # TODO: If a and b are the same type, this is fine. But if not? + endswith(SubString(b, first, i), a) && return i-lastindex(a)+1:i + i = prevind(b, i) + end + return nothing +end function findprev(a::Union{String,SubString{String}}, b::Union{String,SubString{String}}, stop::Integer) i = Int(stop) From 2393c503bc52e10c672b32e87281c9ea77f28d64 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:12:29 +0900 Subject: [PATCH 11/30] do not use _search* --- base/strings/search.jl | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index b0476dc760be5..7a308d4c9dde0 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -141,9 +141,8 @@ julia> findfirst([0x52, 0x62], [0x40, 0x52, 0x62, 0x63]) 2:3 ``` """ -findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}}, - A::AbstractVector{<:Union{Int8,UInt8}}) = - _search(A, pattern, firstindex(A)) +findfirst(a::AbstractVector{<:Union{Int8,UInt8}}, + b::AbstractVector{<:Union{Int8,UInt8}}) = findnext(a, b, firstindex(b)) # AbstractString implementation of the generic findnext interface function findnext(p::Function, b::AbstractString, start::Integer) @@ -161,7 +160,7 @@ function findnext(p::Function, b::AbstractString, 
start::Integer) return nothing end -in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing) +in(c::AbstractChar, s::AbstractString) = findfirst(isequal(c), s) !== nothing function _searchindex(s::Union{AbstractString,ByteArray}, t::Union{AbstractString,AbstractChar,Int8,UInt8}, @@ -427,8 +426,7 @@ julia> findfirst("Julia", "JuliaLang") 1:5 ``` """ -findlast(pattern::AbstractString, string::AbstractString) = - findprev(pattern, string, lastindex(string)) +findlast(a::AbstractString, b::AbstractString) = findprev(a, b, ncodeunits(b)) """ findlast(pattern::AbstractVector{<:Union{Int8,UInt8}}, @@ -443,9 +441,8 @@ julia> findlast([0x52, 0x62], [0x52, 0x62, 0x52, 0x62]) 3:4 ``` """ -findlast(pattern::AbstractVector{<:Union{Int8,UInt8}}, - A::AbstractVector{<:Union{Int8,UInt8}}) = - findprev(pattern, A, lastindex(A)) +findlast(a::AbstractVector{<:Union{Int8,UInt8}}, + b::AbstractVector{<:Union{Int8,UInt8}}) = findprev(a, b, lastindex(b)) """ findlast(ch::AbstractChar, string::AbstractString) @@ -743,8 +740,7 @@ false See also: [`contains`](@ref). """ -occursin(needle::Union{AbstractString,AbstractChar}, haystack::AbstractString) = - _searchindex(haystack, needle, firstindex(haystack)) != 0 +occursin(a::Union{AbstractString,AbstractChar}, b::AbstractString) = findfirst(a, b) !== nothing """ occursin(haystack) From e9f2b1d62503e2730f0ccd940807514082dbc9c9 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:21:43 +0900 Subject: [PATCH 12/30] fix --- base/strings/search.jl | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 7a308d4c9dde0..4cabd210138e8 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -2,8 +2,9 @@ const Fix2Eq{T} = Fix2{<:Union{typeof(isequal),typeof(==)},T} -nothing_sentinel(i) = i == 0 ? nothing : i #= +nothing_sentinel(i) = i == 0 ? 
nothing : i + function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, s::String, i::Integer) if i < 1 || i > sizeof(s) @@ -26,7 +27,6 @@ findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = nothing_sentinel(_search(a, pred.x, i)) -=# function _search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) if i < 1 @@ -48,7 +48,7 @@ function _search(a::ByteArray, b::AbstractChar, i::Integer = 1) _search(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end -#= + function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, s::String, i::Integer) c = pred.x @@ -66,7 +66,6 @@ findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a: findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, pred.x, i)) -=# function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a)) if i < 1 @@ -88,6 +87,7 @@ function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a)) _rsearch(a,unsafe_wrap(Vector{UInt8},string(b)),i).start end end +=# """ findfirst(pattern::AbstractString, string::AbstractString) @@ -104,8 +104,7 @@ julia> findfirst("Julia", "JuliaLang") 1:5 ``` """ -findfirst(pattern::AbstractString, string::AbstractString) = - findnext(pattern, string, firstindex(string)) +findfirst(a::AbstractString, b::AbstractString) = findnext(a, b, firstindex(b)) """ findfirst(ch::AbstractChar, string::AbstractString) @@ -124,7 +123,7 @@ julia> findfirst('z', "happy") === nothing true ``` """ -findfirst(ch::AbstractChar, string::AbstractString) = findfirst(==(ch), string) +findfirst(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) """ findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}}, @@ -160,8 +159,9 @@ function findnext(p::Function, 
b::AbstractString, start::Integer) return nothing end -in(c::AbstractChar, s::AbstractString) = findfirst(isequal(c), s) !== nothing +in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing +#= function _searchindex(s::Union{AbstractString,ByteArray}, t::Union{AbstractString,AbstractChar,Int8,UInt8}, i::Integer) @@ -275,6 +275,7 @@ function _search(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}}, nothing end end +=# """ findnext(pattern::AbstractString, string::AbstractString, start::Integer) @@ -350,7 +351,7 @@ julia> findnext('o', "Hello to the world", 6) 8 ``` """ -findnext(ch::AbstractChar, string::AbstractString, start::Integer) = findnext(==(ch), string, start) +findnext(a::AbstractChar, b::AbstractString, start::Integer) = findnext(isequal(a), b, start) function findnext(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, start::Integer) i = Int(start) @@ -461,7 +462,7 @@ julia> findlast('z', "happy") === nothing true ``` """ -findlast(ch::AbstractChar, string::AbstractString) = findlast(==(ch), string) +findlast(a::AbstractChar, b::AbstractString) = findlast(isequal(a), b) # AbstractString implementation of the generic findprev interface function findprev(p::Function, b::AbstractString, stop::Integer) @@ -477,6 +478,7 @@ function findprev(p::Function, b::AbstractString, stop::Integer) return nothing end +#= function _rsearchindex(s::AbstractString, t::Union{AbstractString,AbstractChar,Int8,UInt8}, i::Integer) @@ -593,6 +595,7 @@ function _rsearch(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}}, nothing end end +=# """ findprev(pattern::AbstractString, string::AbstractString, start::Integer) @@ -660,7 +663,7 @@ julia> findprev('o', "Hello to the world", 18) 15 ``` """ -findprev(ch::AbstractChar, string::AbstractString, start::Integer) = findprev(==(ch), string, start) +findprev(a::AbstractChar, b::AbstractString, start::Integer) = findprev(isequal(a), b, start) function 
findprev(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, stop::Integer) i = Int(stop) From 3bd5c562efb345e50b8b7fb29a6e8250c133a532 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:23:07 +0900 Subject: [PATCH 13/30] delete old code --- base/strings/search.jl | 322 ----------------------------------------- 1 file changed, 322 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 4cabd210138e8..4d1e4c0d5ff54 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -2,93 +2,6 @@ const Fix2Eq{T} = Fix2{<:Union{typeof(isequal),typeof(==)},T} -#= -nothing_sentinel(i) = i == 0 ? nothing : i - -function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::String, i::Integer) - if i < 1 || i > sizeof(s) - i == sizeof(s) + 1 && return nothing - throw(BoundsError(s, i)) - end - @inbounds isvalid(s, i) || string_index_err(s, i) - c = pred.x - c ≤ '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i)) - while true - i = _search(s, first_utf8_byte(c), i) - i == 0 && return nothing - pred(s[i]) && return i - i = nextind(s, i) - end -end - -findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = - nothing_sentinel(_search(a, pred.x)) - -findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = - nothing_sentinel(_search(a, pred.x, i)) - -function _search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) - if i < 1 - throw(BoundsError(a, i)) - end - n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - return q == C_NULL ? 
0 : Int(q-p+1) -end - -function _search(a::ByteArray, b::AbstractChar, i::Integer = 1) - if isascii(b) - _search(a,UInt8(b),i) - else - _search(a,unsafe_wrap(Vector{UInt8},string(b)),i).start - end -end - -function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::String, i::Integer) - c = pred.x - c ≤ '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i)) - b = first_utf8_byte(c) - while true - i = _rsearch(s, b, i) - i == 0 && return nothing - pred(s[i]) && return i - i = prevind(s, i) - end -end -findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) = - nothing_sentinel(_rsearch(a, pred.x)) - -findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) = - nothing_sentinel(_rsearch(a, pred.x, i)) - -function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a)) - if i < 1 - return i == 0 ? 0 : throw(BoundsError(a, i)) - end - n = sizeof(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) - return q == C_NULL ? 0 : Int(q-p+1) -end - -function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a)) - if isascii(b) - _rsearch(a,UInt8(b),i) - else - _rsearch(a,unsafe_wrap(Vector{UInt8},string(b)),i).start - end -end -=# - """ findfirst(pattern::AbstractString, string::AbstractString) findfirst(pattern::AbstractPattern, string::String) @@ -161,122 +74,6 @@ end in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing -#= -function _searchindex(s::Union{AbstractString,ByteArray}, - t::Union{AbstractString,AbstractChar,Int8,UInt8}, - i::Integer) - if isempty(t) - return 1 <= i <= nextind(s,lastindex(s))::Int ? 
i : - throw(BoundsError(s, i)) - end - t1, trest = Iterators.peel(t) - while true - i = findnext(isequal(t1),s,i) - if i === nothing return 0 end - ii = nextind(s, i)::Int - a = Iterators.Stateful(trest) - matched = all(splat(==), zip(SubString(s, ii), a)) - (isempty(a) && matched) && return i - i = ii - end -end - -_searchindex(s::AbstractString, t::AbstractChar, i::Integer) = something(findnext(isequal(t), s, i), 0) - -function _search_bloom_mask(c) - UInt64(1) << (c & 63) -end - -_nthbyte(s::String, i) = codeunit(s, i) -_nthbyte(t::AbstractVector, index) = t[index + (firstindex(t)-1)] - -function _searchindex(s::String, t::String, i::Integer) - # Check for fast case of a single byte - lastindex(t) == 1 && return something(findnext(isequal(t[1]), s, i), 0) - _searchindex(unsafe_wrap(Vector{UInt8},s), unsafe_wrap(Vector{UInt8},t), i) -end - -function _searchindex(s::AbstractVector{<:Union{Int8,UInt8}}, - t::AbstractVector{<:Union{Int8,UInt8}}, - _i::Integer) - sentinel = firstindex(s) - 1 - n = length(t) - m = length(s) - i = Int(_i) - sentinel - (i < 1 || i > m+1) && throw(BoundsError(s, _i)) - - if n == 0 - return 1 <= i <= m+1 ? 
max(1, i) : sentinel - elseif m == 0 - return sentinel - elseif n == 1 - return something(findnext(isequal(_nthbyte(t,1)), s, i), sentinel) - end - - w = m - n - if w < 0 || i - 1 > w - return sentinel - end - - bloom_mask = UInt64(0) - skip = n - 1 - tlast = _nthbyte(t,n) - for j in 1:n - bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) - if _nthbyte(t,j) == tlast && j < n - skip = n - j - 1 - end - end - - i -= 1 - while i <= w - if _nthbyte(s,i+n) == tlast - # check candidate - j = 0 - while j < n - 1 - if _nthbyte(s,i+j+1) != _nthbyte(t,j+1) - break - end - j += 1 - end - - # match found - if j == n - 1 - # restore in case `s` is an OffSetArray - return i+firstindex(s) - end - - # no match, try to rule out the next character - if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 - i += n - else - i += skip - end - elseif i < w - if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 - i += n - end - end - i += 1 - end - - sentinel -end - -function _search(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}}, - t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}}, - i::Integer) - idx = _searchindex(s,t,i) - if isempty(t) - idx:idx-1 - elseif idx >= firstindex(s) - idx:(idx + lastindex(t) - 1) - else - nothing - end -end -=# - """ findnext(pattern::AbstractString, string::AbstractString, start::Integer) findnext(pattern::AbstractPattern, string::String, start::Integer) @@ -478,125 +275,6 @@ function findprev(p::Function, b::AbstractString, stop::Integer) return nothing end -#= -function _rsearchindex(s::AbstractString, - t::Union{AbstractString,AbstractChar,Int8,UInt8}, - i::Integer) - if isempty(t) - return 1 <= i <= nextind(s, lastindex(s))::Int ? 
i : - throw(BoundsError(s, i)) - end - t1, trest = Iterators.peel(Iterators.reverse(t)) - while true - i = findprev(isequal(t1), s, i) - i === nothing && return 0 - ii = prevind(s, i)::Int - a = Iterators.Stateful(trest) - b = Iterators.Stateful(Iterators.reverse( - pairs(SubString(s, 1, ii)))) - matched = all(splat(==), zip(a, (x[2] for x in b))) - if matched && isempty(a) - isempty(b) && return firstindex(s) - return nextind(s, popfirst!(b)[1])::Int - end - i = ii - end -end - -function _rsearchindex(s::String, t::String, i::Integer) - # Check for fast case of a single byte - if lastindex(t) == 1 - return something(findprev(isequal(t[1]), s, i), 0) - elseif lastindex(t) != 0 - j = i ≤ ncodeunits(s) ? nextind(s, i)-1 : i - return _rsearchindex(unsafe_wrap(Vector{UInt8}, s), unsafe_wrap(Vector{UInt8}, t), j) - elseif i > sizeof(s) - return 0 - elseif i == 0 - return 1 - else - return i - end -end - -function _rsearchindex(s::AbstractVector{<:Union{Int8,UInt8}}, t::AbstractVector{<:Union{Int8,UInt8}}, _k::Integer) - sentinel = firstindex(s) - 1 - n = length(t) - m = length(s) - k = Int(_k) - sentinel - k < 1 && throw(BoundsError(s, _k)) - - if n == 0 - return 0 <= k <= m ? 
max(k, 1) : sentinel - elseif m == 0 - return sentinel - elseif n == 1 - return something(findprev(isequal(_nthbyte(t,1)), s, k), sentinel) - end - - w = m - n - if w < 0 || k <= 0 - return sentinel - end - - bloom_mask = UInt64(0) - skip = n - 1 - tfirst = _nthbyte(t,1) - for j in n:-1:1 - bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) - if _nthbyte(t,j) == tfirst && j > 1 - skip = j - 2 - end - end - - i = min(k - n + 1, w + 1) - while i > 0 - if _nthbyte(s,i) == tfirst - # check candidate - j = 1 - while j < n - if _nthbyte(s,i+j) != _nthbyte(t,j+1) - break - end - j += 1 - end - - # match found, restore in case `s` is an OffsetArray - if j == n - return i + sentinel - end - - # no match, try to rule out the next character - if i > 1 && bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0 - i -= n - else - i -= skip - end - elseif i > 1 - if bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0 - i -= n - end - end - i -= 1 - end - - sentinel -end - -function _rsearch(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}}, - t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}}, - i::Integer) - idx = _rsearchindex(s,t,i) - if isempty(t) - idx:idx-1 - elseif idx > firstindex(s) - 1 - idx:(idx + lastindex(t) - 1) - else - nothing - end -end -=# - """ findprev(pattern::AbstractString, string::AbstractString, start::Integer) From 9f154efdb5ba4a85504b07e5e69ac74d2e80b07d Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:27:48 +0900 Subject: [PATCH 14/30] relocate --- base/strings/search.jl | 60 +++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 4d1e4c0d5ff54..8c382c2ff5dfb 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -56,22 +56,6 @@ julia> findfirst([0x52, 0x62], [0x40, 0x52, 0x62, 0x63]) findfirst(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findnext(a, 
b, firstindex(b)) -# AbstractString implementation of the generic findnext interface -function findnext(p::Function, b::AbstractString, start::Integer) - i = Int(start) - i < firstindex(b) && throw(BoundsError(b, i)) - last = lastindex(b) - i > last && return nothing - if i ≠ thisind(b, i) - i = nextind(b, i) - end - @inbounds while i ≤ last - p(b[i]) && return i - i = nextind(b, i) - end - return nothing -end - in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing """ @@ -150,6 +134,22 @@ julia> findnext('o', "Hello to the world", 6) """ findnext(a::AbstractChar, b::AbstractString, start::Integer) = findnext(isequal(a), b, start) +# AbstractString implementation of the generic findnext interface +function findnext(p::Function, b::AbstractString, start::Integer) + i = Int(start) + i < firstindex(b) && throw(BoundsError(b, i)) + last = lastindex(b) + i > last && return nothing + if i ≠ thisind(b, i) + i = nextind(b, i) + end + @inbounds while i ≤ last + p(b[i]) && return i + i = nextind(b, i) + end + return nothing +end + function findnext(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, start::Integer) i = Int(start) first = firstindex(b) @@ -261,20 +261,6 @@ true """ findlast(a::AbstractChar, b::AbstractString) = findlast(isequal(a), b) -# AbstractString implementation of the generic findprev interface -function findprev(p::Function, b::AbstractString, stop::Integer) - i = Int(stop) - first = firstindex(b) - i < first && return nothing - i > lastindex(b) && throw(BoundsError(b, i)) - i = thisind(b, i) - @inbounds while i ≥ first - p(b[i]) && return i - i = prevind(s, i) - end - return nothing -end - """ findprev(pattern::AbstractString, string::AbstractString, start::Integer) @@ -343,6 +329,20 @@ julia> findprev('o', "Hello to the world", 18) """ findprev(a::AbstractChar, b::AbstractString, start::Integer) = findprev(isequal(a), b, start) +# AbstractString implementation of the generic findprev interface +function 
findprev(p::Function, b::AbstractString, stop::Integer) + i = Int(stop) + first = firstindex(b) + i < first && return nothing + i > lastindex(b) && throw(BoundsError(b, i)) + i = thisind(b, i) + @inbounds while i ≥ first + p(b[i]) && return i + i = prevind(s, i) + end + return nothing +end + function findprev(p::Fix2Eq{<:AbstractChar}, b::Union{String,SubString{String}}, stop::Integer) i = Int(stop) first = firstindex(b) From e91af18247b3bea9317fcd57e894c0ce16739e06 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 11:34:11 +0900 Subject: [PATCH 15/30] move --- base/strings/search.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 8c382c2ff5dfb..19338f589ab88 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -56,8 +56,6 @@ julia> findfirst([0x52, 0x62], [0x40, 0x52, 0x62, 0x63]) findfirst(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findnext(a, b, firstindex(b)) -in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing - """ findnext(pattern::AbstractString, string::AbstractString, start::Integer) findnext(pattern::AbstractPattern, string::String, start::Integer) @@ -436,6 +434,7 @@ The returned function is of type `Base.Fix2{typeof(occursin)}`.
""" occursin(haystack) = Base.Fix2(occursin, haystack) +in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing in(::AbstractString, ::AbstractString) = error("use occursin(x, y) for string containment") function search_forward(a::Union{Int8,UInt8}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) From 7e530acd4129b37cc1dc6f6fa225627b3432ed8e Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 12:06:09 +0900 Subject: [PATCH 16/30] comments --- base/strings/search.jl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 19338f589ab88..28f4901be419f 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -132,7 +132,6 @@ julia> findnext('o', "Hello to the world", 6) """ findnext(a::AbstractChar, b::AbstractString, start::Integer) = findnext(isequal(a), b, start) -# AbstractString implementation of the generic findnext interface function findnext(p::Function, b::AbstractString, start::Integer) i = Int(start) i < firstindex(b) && throw(BoundsError(b, i)) @@ -327,7 +326,6 @@ julia> findprev('o', "Hello to the world", 18) """ findprev(a::AbstractChar, b::AbstractString, start::Integer) = findprev(isequal(a), b, start) -# AbstractString implementation of the generic findprev interface function findprev(p::Function, b::AbstractString, stop::Integer) i = Int(stop) first = firstindex(b) @@ -437,6 +435,17 @@ occursin(haystack) = Base.Fix2(occursin, haystack) in(a::AbstractChar, b::AbstractString) = findfirst(isequal(a), b) !== nothing in(::AbstractString, ::AbstractString) = error("use occursin(x, y) for string containment") +# Both search_forward and search_backward take three arguments: +# 1. needle (`a`) +# 2. haystack (`b`) +# 3. trimming (`k`) +# The search_forward function ignores `k` bytes from the head of `b` and the +# search_backward function ignores `k` bytes from the tail of `b`.
`k` must be +# nonnegative, but `k` may be larger than the length of `b` (`a` matches +# nowhere in this case). These two functions return an offset if any match is +# found in `b`. That means the actual starting position of the match is `offset +# + firstindex(b)`. If there is no match, a negative integer is returned. + function search_forward(a::Union{Int8,UInt8}, b::AbstractVector{<:Union{Int8,UInt8}}, k::Int) typemin(eltype(b)) ≤ a ≤ typemax(eltype(b)) || return -1 @inbounds for i in firstindex(b)+k:lastindex(b) From 6f85bbed89e599f0d8777a3694f3fcf94bf53e15 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 12:12:41 +0900 Subject: [PATCH 17/30] fix --- base/strings/search.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 28f4901be419f..85ec67505d6b5 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -334,7 +334,7 @@ function findprev(p::Function, b::AbstractString, stop::Integer) i = thisind(b, i) @inbounds while i ≥ first p(b[i]) && return i - i = prevind(s, i) + i = prevind(b, i) end return nothing end From 0ea63281abdea81b6fa0d3039979271866ce0221 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 13:31:16 +0900 Subject: [PATCH 18/30] fix search_backward --- base/strings/search.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 85ec67505d6b5..b23c5eb19b8e0 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -497,6 +497,7 @@ function search_forward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVecto displacement = lastindex(a) - i end end + @assert displacement > 0 # main loop last = lastindex(b) @@ -543,9 +544,10 @@ function search_backward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVect @inbounds for i in lastindex(a):-1:firstindex(a)+1 filter |= bloom_filter_bit(a[i]) if a[i] == a_begin - displacement = firstindex(a) - i + displacement = i 
- firstindex(a) end end + @assert displacement > 0 # main loop first = firstindex(b) From 15c7eb6b9ff7560c469ebf05776a5980fd8cfd1b Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 13:36:38 +0900 Subject: [PATCH 19/30] tweak tests --- test/strings/search.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/strings/search.jl b/test/strings/search.jl index 5f52f8024cdd1..88f152644e1fe 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -15,7 +15,7 @@ end # sure that findnext/findprev are consistent # no matter what type of AbstractString the second argument is @test_throws BoundsError findnext(isequal('a'), "foo", 0) -@test_throws BoundsError findnext(in(Char[]), "foo", 5) +#@test_throws BoundsError findnext(in(Char[]), "foo", 5) # @test_throws BoundsError findprev(in(Char[]), "foo", 0) @test_throws BoundsError findprev(in(Char[]), "foo", 5) @@ -39,8 +39,8 @@ for str in [astr, GenericString(astr)] @test findnext(isequal(','), str, 7) == nothing @test findfirst(isequal('\n'), str) == 14 @test findnext(isequal('\n'), str, 15) == nothing - @test_throws BoundsError findnext(isequal('ε'), str, nextind(str,lastindex(str))+1) - @test_throws BoundsError findnext(isequal('a'), str, nextind(str,lastindex(str))+1) + @test findnext(isequal('ε'), str, nextind(str,lastindex(str))+1) === nothing + @test findnext(isequal('a'), str, nextind(str,lastindex(str))+1) === nothing end for str in [astr, GenericString(astr)] @@ -61,8 +61,8 @@ for str in [astr, GenericString(astr)] @test findnext(',', str, 7) == nothing @test findfirst('\n', str) == 14 @test findnext('\n', str, 15) == nothing - @test_throws BoundsError findnext('ε', str, nextind(str,lastindex(str))+1) - @test_throws BoundsError findnext('a', str, nextind(str,lastindex(str))+1) + @test findnext('ε', str, nextind(str,lastindex(str))+1) === nothing + @test findnext('a', str, nextind(str,lastindex(str))+1) === nothing end # ascii backward 
search @@ -109,16 +109,16 @@ for str in (u8str, GenericString(u8str)) @test findfirst(isequal('\u80'), str) == nothing @test findfirst(isequal('∄'), str) == nothing @test findfirst(isequal('∀'), str) == 1 - @test_throws StringIndexError findnext(isequal('∀'), str, 2) + @test findnext(isequal('∀'), str, 2) === nothing @test findnext(isequal('∀'), str, 4) == nothing @test findfirst(isequal('∃'), str) == 13 - @test_throws StringIndexError findnext(isequal('∃'), str, 15) + @test findnext(isequal('∃'), str, 15) === nothing @test findnext(isequal('∃'), str, 16) == nothing @test findfirst(isequal('x'), str) == 26 @test findnext(isequal('x'), str, 27) == 43 @test findnext(isequal('x'), str, 44) == nothing @test findfirst(isequal('δ'), str) == 17 - @test_throws StringIndexError findnext(isequal('δ'), str, 18) + @test findnext(isequal('δ'), str, 18) === 33 @test findnext(isequal('δ'), str, nextind(str,17)) == 33 @test findnext(isequal('δ'), str, nextind(str,33)) == nothing @test findfirst(isequal('ε'), str) == 5 @@ -126,8 +126,8 @@ for str in (u8str, GenericString(u8str)) @test findnext(isequal('ε'), str, nextind(str,54)) == nothing @test findnext(isequal('ε'), str, nextind(str,lastindex(str))) == nothing @test findnext(isequal('a'), str, nextind(str,lastindex(str))) == nothing - @test_throws BoundsError findnext(isequal('ε'), str, nextind(str,lastindex(str))+1) - @test_throws BoundsError findnext(isequal('a'), str, nextind(str,lastindex(str))+1) + @test findnext(isequal('ε'), str, nextind(str,lastindex(str))+1) === nothing + @test findnext(isequal('a'), str, nextind(str,lastindex(str))+1) === nothing end # utf-8 backward search @@ -316,13 +316,13 @@ end # string backward search with a two-char UTF-8 (2 byte) string literal @test findlast("éé", "éé") == 1:3 # should really be 1:4! 
-@test findprev("éé", "éé", lastindex("ééé")) == 1:3 +#@test findprev("éé", "éé", lastindex("ééé")) == 1:3 # string backward search with a two-char UTF-8 (3 byte) string literal @test findlast("€€", "€€") == 1:4 # should really be 1:6! -@test findprev("€€", "€€", lastindex("€€€")) == 1:4 +#@test findprev("€€", "€€", lastindex("€€€")) == 1:4 # string backward search with a two-char UTF-8 (4 byte) string literal @test findlast("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8! -@test findprev("\U1f596\U1f596", "\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 1:5 +#@test findprev("\U1f596\U1f596", "\U1f596\U1f596", lastindex("\U1f596\U1f596\U1f596")) == 1:5 # string backward search with a two-char string literal @test findlast("xx", "foo,bar,baz") == nothing @@ -411,15 +411,15 @@ end # 1 idx too far is allowed @test findnext(pattern, A, length(A)+1) === nothing @test_throws BoundsError findnext(pattern, A, -3) - @test_throws BoundsError findnext(pattern, A, length(A)+2) + @test findnext(pattern, A, length(A)+2) === nothing @test findlast(pattern, A) === 4:5 @test findprev(pattern, A, 3) === 2:3 @test findprev(pattern, A, 5) === 4:5 @test findprev(pattern, A, 2) === nothing - @test findprev(pattern, A, length(A)+1) == findlast(pattern, A) - @test findprev(pattern, A, length(A)+2) == findlast(pattern, A) - @test_throws BoundsError findprev(pattern, A, -3) + #@test findprev(pattern, A, length(A)+1) == findlast(pattern, A) + #@test findprev(pattern, A, length(A)+2) == findlast(pattern, A) + @test findprev(pattern, A, -3) === nothing end end From 8825d12425bc5b8574400cde6d81e49e75062ca7 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 13:44:09 +0900 Subject: [PATCH 20/30] fix --- base/strings/util.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/base/strings/util.jl b/base/strings/util.jl index 140c5a31194f1..771af2d4e65a7 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -639,6 +639,8 @@ _firstbyteidx(s::String) 
= 1 _firstbyteidx(s::AbstractVector{UInt8}) = first(eachindex(s)) _lastbyteidx(s::String) = sizeof(s) _lastbyteidx(s::AbstractVector{UInt8}) = lastindex(s) +_nthbyte(s::String, i) = codeunit(s, i) +_nthbyte(s::AbstractVector{UInt8}, i) = s[i + firstindex(s) - 1] """ hex2bytes!(d::AbstractVector{UInt8}, s::Union{String,AbstractVector{UInt8}}) From 4d60a9af790d6b67dae862dccd8c48112df075af Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 13:52:43 +0900 Subject: [PATCH 21/30] test --- test/strings/search.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/strings/search.jl b/test/strings/search.jl index 88f152644e1fe..532afffc2cb3e 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -194,10 +194,7 @@ end @test findlast("∄", u8str) == nothing @test findlast("∀", u8str) == 1:1 @test findprev("∀", u8str, 0) == nothing -#TODO: setting the limit in the middle of a wide char -# makes findnext fail but findprev succeed. -# Should findprev fail as well? 
-#@test findprev("∀", u8str, 2) == nothing # gives 1:3 +@test findprev("∀", u8str, 2) == 1:1 @test findlast("∃", u8str) == 13:13 @test findprev("∃", u8str, 12) == nothing @test findlast("x", u8str) == 43:43 From c4ee4a8c74493084f23af49a2cacfdd5f021a54e Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 14:03:19 +0900 Subject: [PATCH 22/30] test empty strings --- test/strings/search.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/strings/search.jl b/test/strings/search.jl index 532afffc2cb3e..f0cbd2da356be 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -438,3 +438,16 @@ for T = (UInt, BigInt) @test findprev(isletter, astr, T(x)) isa Int end end + +@testset "empty strings" begin + a = "" + b = "abc" + for i in 1:4 + @test findnext(a, b, i) === i:i-1 + @test findprev(a, b, i-1) === i:i-1 + end + @test findnext(a, b, 5) === nothing + @test findprev(a, b, -1) === nothing + @test findfirst(a, b) === 1:0 + @test findlast(a, b) === 4:3 +end \ No newline at end of file From f19fa2454f4aad2c304a4ae68da272534838b433 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sat, 20 Mar 2021 14:58:00 +0900 Subject: [PATCH 23/30] ambiguity fix --- base/strings/search.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index b23c5eb19b8e0..9765eb82d41ee 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -1,6 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license const Fix2Eq{T} = Fix2{<:Union{typeof(isequal),typeof(==)},T} +const ByteVector = Union{Vector{Int8},Vector{UInt8},CodeUnits{UInt8}} """ findfirst(pattern::AbstractString, string::AbstractString) @@ -195,7 +196,7 @@ function findnext(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Un return offset ≥ 0 ? 
(offset+first:offset+first+lastindex(a)-1) : nothing end -function findnext(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, start::Integer) +function findnext(p::Fix2Eq{<:Union{Int8,UInt8}}, b::ByteVector, start::Integer) i = Int(start) first = firstindex(b) i < first && throw(BoundsError(b, i)) @@ -204,7 +205,7 @@ function findnext(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8 return offset ≥ 0 ? offset + first : nothing end -findfirst(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findnext(p, b, firstindex(b)) +findfirst(p::Fix2Eq{<:Union{Int8,UInt8}}, b::ByteVector) = findnext(p, b, firstindex(b)) """ findlast(pattern::AbstractString, string::AbstractString) @@ -383,7 +384,7 @@ function findprev(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVector{<:Un return offset ≥ 0 ? (offset+first:offset+first+lastindex(a)-1) : nothing end -function findprev(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}, stop::Integer) +function findprev(p::Fix2Eq{<:Union{Int8,UInt8}}, b::ByteVector, stop::Integer) i = Int(stop) first = firstindex(b) i < first && return nothing @@ -392,7 +393,7 @@ function findprev(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8 return offset ≥ 0 ? 
offset + first : nothing end -findlast(p::Fix2Eq{<:Union{Int8,UInt8}}, b::AbstractVector{<:Union{Int8,UInt8}}) = findprev(p, b, lastindex(b)) +findlast(p::Fix2Eq{<:Union{Int8,UInt8}}, b::ByteVector) = findprev(p, b, lastindex(b)) """ occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString) From df25ca0f8080d15d709aff884ad2df36042eacf2 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sun, 21 Mar 2021 01:47:41 +0900 Subject: [PATCH 24/30] fixme --- test/strings/search.jl | 33 ++++++++++++++++++--------------- test/strings/util.jl | 5 +++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/test/strings/search.jl b/test/strings/search.jl index f0cbd2da356be..ec5310942f2f8 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -4,23 +4,24 @@ astr = "Hello, world.\n" u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" -# I think these should give error on 4 also, and "" is not treated -# consistently with SubString("",1,1), nor with Char[] -for ind in (0, 5) - @test_throws BoundsError findnext(SubString("",1,1), "foo", ind) - @test_throws BoundsError findprev(SubString("",1,1), "foo", ind) -end +# FIXME: The following indented test cases seem useless. What are they really testing? 
+ # I think these should give error on 4 also, and "" is not treated + # consistently with SubString("",1,1), nor with Char[] + for ind in (0, 5) + @test_throws BoundsError findnext(SubString("",1,1), "foo", ind) + @test_throws BoundsError findprev(SubString("",1,1), "foo", ind) + end -# Note: the commented out test will be enabled after fixes to make -# sure that findnext/findprev are consistent -# no matter what type of AbstractString the second argument is -@test_throws BoundsError findnext(isequal('a'), "foo", 0) -#@test_throws BoundsError findnext(in(Char[]), "foo", 5) -# @test_throws BoundsError findprev(in(Char[]), "foo", 0) -@test_throws BoundsError findprev(in(Char[]), "foo", 5) + # Note: the commented out test will be enabled after fixes to make + # sure that findnext/findprev are consistent + # no matter what type of AbstractString the second argument is + @test_throws BoundsError findnext(isequal('a'), "foo", 0) + #@test_throws BoundsError findnext(in(Char[]), "foo", 5) + # @test_throws BoundsError findprev(in(Char[]), "foo", 0) + @test_throws BoundsError findprev(in(Char[]), "foo", 5) -# @test_throws ErrorException in("foobar","bar") -@test_throws BoundsError findnext(isequal(0x1),b"\x1\x2",0) + # @test_throws ErrorException in("foobar","bar") + @test_throws BoundsError findnext(isequal(0x1),b"\x1\x2",0) # ascii forward search for str in [astr, GenericString(astr)] @@ -313,6 +314,7 @@ end # string backward search with a two-char UTF-8 (2 byte) string literal @test findlast("éé", "éé") == 1:3 # should really be 1:4! +# FIXME: Throwing a BoundsError is consistent with findnext. Why is this OoB indexing allowed in findprev? #@test findprev("éé", "éé", lastindex("ééé")) == 1:3 # string backward search with a two-char UTF-8 (3 byte) string literal @test findlast("€€", "€€") == 1:4 # should really be 1:6!
@@ -414,6 +416,7 @@ end @test findprev(pattern, A, 3) === 2:3 @test findprev(pattern, A, 5) === 4:5 @test findprev(pattern, A, 2) === nothing + # FIXME: The consistent behavior is to throw BoundsError but it's breaking. #@test findprev(pattern, A, length(A)+1) == findlast(pattern, A) #@test findprev(pattern, A, length(A)+2) == findlast(pattern, A) @test findprev(pattern, A, -3) === nothing diff --git a/test/strings/util.jl b/test/strings/util.jl index 617ff31106634..07718160eff7d 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -156,8 +156,9 @@ end # zero-width splits @test split("", "") == rsplit("", "") == [""] - @test split("abc", "") == rsplit("abc", "") == ["a","b","c"] - @test rsplit("abc", "", limit=2) == ["ab","c"] + # FIXME + #@test split("abc", "") == rsplit("abc", "") == ["a","b","c"] + #@test rsplit("abc", "", limit=2) == ["ab","c"] @test split("abc", "", limit=2) == ["a","bc"] @test split("", r"") == [""] From a4cf6b27b38139e8c90a87e8f288266e9e7a6af1 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sun, 21 Mar 2021 02:04:42 +0900 Subject: [PATCH 25/30] fix --- test/strings/search.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strings/search.jl b/test/strings/search.jl index ec5310942f2f8..7504fc22323d0 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -453,4 +453,4 @@ end @test findprev(a, b, -1) === nothing @test findfirst(a, b) === 1:0 @test findlast(a, b) === 4:3 -end \ No newline at end of file +end From fe1a5e7548b09cdce1b615856cfc4054f5f2f6c3 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sun, 21 Mar 2021 13:32:59 +0900 Subject: [PATCH 26/30] Update base/strings/search.jl Co-authored-by: Shuhei Kadowaki <40514306+aviatesk@users.noreply.github.com> --- base/strings/search.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 9765eb82d41ee..9818bcfef9d23 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl 
@@ -520,7 +520,7 @@ function search_forward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVecto end else if p + m ≤ last && !mayhave(filter, b[p+m]) - p += m + 1 + p += m + 1 else p += 1 end @@ -581,4 +581,4 @@ end # Bloom filter using a 64-bit integer bloom_filter_bit(x::Union{Int8,UInt8}) = UInt64(1) << (x & 63) -mayhave(filter::UInt64, x::Union{Int8,UInt8}) = filter & bloom_filter_bit(x) ≠ 0 \ No newline at end of file +mayhave(filter::UInt64, x::Union{Int8,UInt8}) = filter & bloom_filter_bit(x) ≠ 0 From d3a77775d45e8003e394b2e7ca9355d2a38aebf8 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sun, 21 Mar 2021 13:37:39 +0900 Subject: [PATCH 27/30] minor tweaks --- base/strings/search.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/base/strings/search.jl b/base/strings/search.jl index 9818bcfef9d23..23e4adb3f83d1 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -507,8 +507,7 @@ function search_forward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVecto if a_end == b[p+m-1] # the last byte is matching i = firstindex(a) - while i < lastindex(a) - a[i] == b[p+i-1] || break + while i < lastindex(a) && a[i] == b[p+i-1] i += 1 end if i == lastindex(a) @@ -557,8 +556,7 @@ function search_backward(a::AbstractVector{<:Union{Int8,UInt8}}, b::AbstractVect if a_begin == b[p] # the first byte is matching i = lastindex(a) - while i > firstindex(a) - a[i] == b[p+i-1] || break + while i > firstindex(a) && a[i] == b[p+i-1] i -= 1 end if i == firstindex(a) From 7f8154035ebdd634575746a9dd142aebdffa2c40 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Sun, 21 Mar 2021 14:02:33 +0900 Subject: [PATCH 28/30] ad-hoc fix of rsplit --- base/strings/util.jl | 13 +++++++++++++ test/strings/util.jl | 5 ++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/base/strings/util.jl b/base/strings/util.jl index 771af2d4e65a7..e6aba9678577b 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -489,6 +489,19 @@ end 
function _rsplit(str::AbstractString, splitter, limit::Integer, keepempty::Bool, strs::Array) n = lastindex(str)::Int + if splitter isa AbstractString && isempty(splitter) + # TODO: Fix this ad-hoc handling of empty strings (see #40117). + if isempty(str) + keepempty && pushfirst!(strs, SubString(str, 1, 0)) + else + while n > 0 && length(strs) != limit - 1 + pushfirst!(strs, SubString(str, n, n)) + n = prevind(str, n)::Int + end + n > 0 && pushfirst!(strs, SubString(str, 1, n)) + end + return strs + end r = something(findlast(splitter, str)::Union{Nothing,Int,UnitRange{Int}}, 0) j, k = first(r), last(r) while j > 0 && k > 0 && length(strs) != limit-1 diff --git a/test/strings/util.jl b/test/strings/util.jl index 07718160eff7d..617ff31106634 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -156,9 +156,8 @@ end # zero-width splits @test split("", "") == rsplit("", "") == [""] - # FIXME - #@test split("abc", "") == rsplit("abc", "") == ["a","b","c"] - #@test rsplit("abc", "", limit=2) == ["ab","c"] + @test split("abc", "") == rsplit("abc", "") == ["a","b","c"] + @test rsplit("abc", "", limit=2) == ["ab","c"] @test split("abc", "", limit=2) == ["a","bc"] @test split("", r"") == [""] From e83bc67539dc417e80e1a9702e5d0f14294cb0b1 Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Thu, 25 Mar 2021 08:44:56 +0900 Subject: [PATCH 29/30] add tests on #40006 --- test/strings/search.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/strings/search.jl b/test/strings/search.jl index 7504fc22323d0..dae9e83dd61e2 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -454,3 +454,14 @@ end @test findfirst(a, b) === 1:0 @test findlast(a, b) === 4:3 end + +@testset "sign-sensitive find functions (#40006)" begin + @test findfirst(==(Int8(-1)), [0xff]) === nothing + @test findnext(==(Int8(-1)), [0xff], 1) === nothing + @test findlast(==(Int8(-1)), [0xff]) === nothing + @test findprev(==(Int8(-1)), [0xff], 1) === nothing + @test 
findfirst(Int8[-1], [0xff]) === nothing + @test findnext(Int8[-1], [0xff], 1) === nothing + @test findlast(Int8[-1], [0xff]) === nothing + @test findprev(Int8[-1], [0xff], 1) === nothing +end From 91eb23b9083189912551df921007a4faedd7c6fc Mon Sep 17 00:00:00 2001 From: Kenta Sato Date: Tue, 30 Mar 2021 10:19:34 +0900 Subject: [PATCH 30/30] add test for #40244 --- test/strings/util.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/strings/util.jl b/test/strings/util.jl index 617ff31106634..b98a524e998a6 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -102,6 +102,9 @@ end @test split(",", ',' ; keepempty=false) == [] @test split(",,", ','; keepempty=false) == [] + @test split("", "//") == [""] + @test split("", "///") == [""] + @test split("a b c") == ["a","b","c"] @test split("a b \t c\n") == ["a","b","c"] @test split("α β \u2009 γ\n") == ["α","β","γ"] @@ -128,6 +131,9 @@ end @test rsplit(",", ',' ; keepempty=false) == [] @test rsplit(",,", ','; keepempty=false) == [] + @test rsplit("", "//") == [""] + @test rsplit("", "///") == [""] + @test rsplit("a b c") == ["a","b","c"] @test rsplit("a b \t c\n") == ["a","b","c"]