Skip to content

Commit

Permalink
add eachsplit for iterative splitting (#39245)
Browse files Browse the repository at this point in the history
This moves the existing splitting implementation into an iterator
named `eachsplit` and changes the definition of `split(...)` to
`collect(eachsplit(...))`, plus a few edge cases.
  • Loading branch information
anaveragehuman authored Sep 8, 2021
1 parent ed3691f commit dba8a08
Show file tree
Hide file tree
Showing 12 changed files with 99 additions and 55 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ New language features
* `@inline` and `@noinline` annotations can now be applied to a function callsite or block
to enforce the involved function calls to be (or not to be) inlined. ([#41312])
* The default behavior of observing `@inbounds` declarations is now an option via `auto` in `--check-bounds=yes|no|auto` ([#41551])
* New function `eachsplit(str)` for iteratively performing `split(str)` ([#39245]).

Language changes
----------------
Expand Down
2 changes: 1 addition & 1 deletion base/binaryplatforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ function Base.parse(::Type{Platform}, triplet::AbstractString; validate_strict::
libstdcxx_version = get_field(m, libstdcxx_version_mapping)
cxxstring_abi = get_field(m, cxxstring_abi_mapping)
function split_tags(tagstr)
tag_fields = filter(!isempty, split(tagstr, "-"))
tag_fields = split(tagstr, "-"; keepempty=false)
if isempty(tag_fields)
return Pair{String,String}[]
end
Expand Down
4 changes: 2 additions & 2 deletions base/cmd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ function addenv(cmd::Cmd, env::Dict; inherit::Bool = true)
merge!(new_env, ENV)
end
else
for (k, v) in split.(cmd.env, "=")
for (k, v) in eachsplit.(cmd.env, "=")
new_env[string(k)::String] = string(v)::String
end
end
Expand All @@ -284,7 +284,7 @@ function addenv(cmd::Cmd, pairs::Pair{<:AbstractString}...; inherit::Bool = true
end

function addenv(cmd::Cmd, env::Vector{<:AbstractString}; inherit::Bool = true)
return addenv(cmd, Dict(k => v for (k, v) in split.(env, "=")); inherit)
return addenv(cmd, Dict(k => v for (k, v) in eachsplit.(env, "=")); inherit)
end

(&)(left::AbstractCmd, right::AbstractCmd) = AndCmds(left, right)
Expand Down
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,7 @@ export
codeunits,
digits,
digits!,
eachsplit,
escape_string,
hex2bytes,
hex2bytes!,
Expand Down
4 changes: 2 additions & 2 deletions base/initdefs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ function init_depot_path()
if haskey(ENV, "JULIA_DEPOT_PATH")
str = ENV["JULIA_DEPOT_PATH"]
isempty(str) && return
for path in split(str, Sys.iswindows() ? ';' : ':')
for path in eachsplit(str, Sys.iswindows() ? ';' : ':')
if isempty(path)
append_default_depot_path!(DEPOT_PATH)
else
Expand Down Expand Up @@ -198,7 +198,7 @@ end
function parse_load_path(str::String)
envs = String[]
isempty(str) && return envs
for env in split(str, Sys.iswindows() ? ';' : ':')
for env in eachsplit(str, Sys.iswindows() ? ';' : ':')
if isempty(env)
for env′ in DEFAULT_LOAD_PATH
env′ in envs || push!(envs, env′)
Expand Down
9 changes: 5 additions & 4 deletions base/logging.jl
Original file line number Diff line number Diff line change
Expand Up @@ -674,10 +674,11 @@ function handle_message(logger::SimpleLogger, level::LogLevel, message, _module,
end
iob = IOContext(buf, stream)
levelstr = level == Warn ? "Warning" : string(level)
msglines = split(chomp(string(message)::String), '\n')
println(iob, "", levelstr, ": ", msglines[1])
for i in 2:length(msglines)
println(iob, "", msglines[i])
msglines = eachsplit(chomp(string(message)::String), '\n')
msg1, rest = Iterators.peel(msglines)
println(iob, "", levelstr, ": ", msg1)
for msg in rest
println(iob, "", msg)
end
for (key, val) in kwargs
key === :maxlog && continue
Expand Down
4 changes: 2 additions & 2 deletions base/mpfr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -962,7 +962,7 @@ function string_mpfr(x::BigFloat, fmt::String)
end

function _prettify_bigfloat(s::String)::String
mantissa, exponent = split(s, 'e')
mantissa, exponent = eachsplit(s, 'e')
if !occursin('.', mantissa)
mantissa = string(mantissa, '.')
end
Expand All @@ -973,7 +973,7 @@ function _prettify_bigfloat(s::String)::String
expo = parse(Int, exponent)
if -5 < expo < 6
expo == 0 && return mantissa
int, frac = split(mantissa, '.')
int, frac = eachsplit(mantissa, '.')
if expo > 0
expo < length(frac) ?
string(int, frac[1:expo], '.', frac[expo+1:end]) :
Expand Down
4 changes: 2 additions & 2 deletions base/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,8 @@ function normpath(path::String)
isabs = isabspath(path)
isdir = isdirpath(path)
drive, path = splitdrive(path)
parts = split(path, path_separator_re)
filter!(x->!isempty(x) && x!=".", parts)
parts = split(path, path_separator_re; keepempty=false)
filter!(!=("."), parts)
while true
clean = true
for j = 1:length(parts)-1
Expand Down
119 changes: 80 additions & 39 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,83 @@ function rpad(
r == 0 ? string(s, p^q) : string(s, p^q, first(p, r))
end

"""
eachsplit(str::AbstractString, dlm; limit::Integer=0)
eachsplit(str::AbstractString; limit::Integer=0)
Split `str` on occurrences of the delimiter(s) `dlm` and return an iterator over the
substrings. `dlm` can be any of the formats allowed by [`findnext`](@ref)'s first argument
(i.e. as a string, regular expression or a function), or as a single character or collection
of characters.
If `dlm` is omitted, it defaults to [`isspace`](@ref).
The iterator will return a maximum of `limit` results if the keyword argument is supplied.
The default of `limit=0` implies no maximum.
See also [`split`](@ref).
# Examples
```jldoctest
julia> a = "Ma.rch"
"Ma.rch"
julia> collect(eachsplit(a, "."))
2-element Vector{SubString}:
"Ma"
"rch"
```
"""
function eachsplit end

# Lazy iterator over the pieces of `str` separated by matches of `splitter`;
# this is the object returned by `eachsplit`.
#
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
# and prevents a major invalidation risk (1550 MethodInstances)
struct SplitIterator{S<:AbstractString,F}
# the string being split
str::S
# delimiter matcher, passed as the first argument to `findnext`
splitter::F
# maximum number of pieces to produce; 0 means no maximum
limit::Int
# whether empty pieces are yielded (true) or skipped (false)
keepempty::Bool
end

# Iterator traits for `SplitIterator`.
#
# Elements are reported as `SubString` without a type parameter, which is why
# `collect(eachsplit(...))` produces a `Vector{SubString}`.
# NOTE(review): `SubString` without its parameter is not a concrete type; consider
# narrowing this to `SubString{S}` so collected results get a concrete element type.
eltype(::Type{<:SplitIterator}) = SubString

# The number of pieces is only known after scanning the string, so no length is advertised.
IteratorSize(::Type{<:SplitIterator}) = SizeUnknown()

# Produce the next piece of the split, or `nothing` when the iteration is finished.
#
# The iteration state is the tuple `(i, k, n)`:
# i: the starting index of the substring to be extracted
# k: the index from which the next delimiter search starts; normally the start of the
#    next substring, but one character further along after a match that consumed nothing
# n: the number of substrings returned so far; splitting stops once n == iter.limit - 1
#    so that the rest of the string is returned as the final piece
function iterate(iter::SplitIterator, (i, k, n)=(firstindex(iter.str), firstindex(iter.str), 0))
# The final piece is returned with i set to ncodeunits + 2 (see the last line), so
# this condition recognises "the remainder has already been emitted".
i - 1 > ncodeunits(iter.str)::Int && return nothing
r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
# Keep going while there is a match inside the string and the limit has not been reached.
# With the default limit of 0, `iter.limit - 1` is -1 and never equals n, so there is no cap.
while r !== nothing && n != iter.limit - 1 && first(r) <= ncodeunits(iter.str)
# j: start of the delimiter match; k: first index after the match
j, k = first(r), nextind(iter.str, last(r))::Int
# If the match consumed no characters (k <= j), resume the search one character past j
# so that the loop always makes progress.
k_ = k <= j ? nextind(iter.str, j) : k
if i < k
substr = @inbounds SubString(iter.str, i, prevind(iter.str, j)::Int)
# Yield the piece before the delimiter unless it is empty and empties are dropped.
(iter.keepempty || i < j) && return (substr, (k, k_, n + 1))
# Empty piece skipped: the next piece starts right after this delimiter.
i = k
end
k = k_
r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
end
# No further split: drop an empty trailing remainder when empties are not kept.
iter.keepempty || i <= ncodeunits(iter.str) || return nothing
# Return the remainder of the string; the state's i is pushed past the end so that the
# next call terminates at the first check.
@inbounds SubString(iter.str, i), (ncodeunits(iter.str) + 2, k, n + 1)
end

# Generic method: `splitter` is handed to `findnext` unchanged, so it may be anything
# `findnext` accepts as its first argument.
function eachsplit(str::T, splitter;
                   limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    return SplitIterator(str, splitter, limit, keepempty)
end

# A collection of characters splits on any one of its members.
function eachsplit(str::T,
                   splitter::Union{Tuple{Vararg{AbstractChar}},
                                   AbstractVector{<:AbstractChar},
                                   Set{<:AbstractChar}};
                   limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    return eachsplit(str, in(splitter); limit, keepempty)
end

# A single character splits on occurrences of exactly that character.
function eachsplit(str::T, splitter::AbstractChar;
                   limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
    return eachsplit(str, isequal(splitter); limit, keepempty)
end

# No delimiter: split on whitespace and drop empty pieces by default.
# a bit oddball, but standard behavior in Perl, Ruby & Python:
function eachsplit(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
    return eachsplit(str, isspace; limit, keepempty)
end

"""
split(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true)
split(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
Expand Down Expand Up @@ -412,52 +489,16 @@ julia> split(a, ".")
"rch"
```
"""
function split end

function split(str::T, splitter;
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
_split(str, splitter, limit, keepempty, T <: SubString ? T[] : SubString{T}[])
end
function split(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
_split(str, in(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
end
function split(str::T, splitter::AbstractChar;
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
_split(str, isequal(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
end

function _split(str::AbstractString, splitter::F, limit::Integer, keepempty::Bool, strs::Vector) where F
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
# and prevents a major invalidation risk (1550 MethodInstances)
i = 1 # firstindex(str)
n = lastindex(str)::Int
r = findfirst(splitter,str)::Union{Nothing,Int,UnitRange{Int}}
if r !== nothing
j, k = first(r), nextind(str,last(r))::Int
while 0 < j <= n && length(strs) != limit-1
if i < k
if keepempty || i < j
push!(strs, @inbounds SubString(str,i,prevind(str,j)::Int))
end
i = k
end
(k <= j) && (k = nextind(str,j)::Int)
r = findnext(splitter,str,k)::Union{Nothing,Int,UnitRange{Int}}
r === nothing && break
j, k = first(r), nextind(str,last(r))::Int
end
end
if keepempty || i <= ncodeunits(str)::Int
push!(strs, @inbounds SubString(str,i))
end
return strs
itr = eachsplit(str, splitter; limit, keepempty)
collect(T <: SubString ? T : SubString{T}, itr)
end

# a bit oddball, but standard behavior in Perl, Ruby & Python:
split(str::AbstractString;
limit::Integer=0, keepempty::Bool=false) =
split(str, isspace; limit=limit, keepempty=keepempty)
split(str, isspace; limit, keepempty)

"""
rsplit(s::AbstractString; limit::Integer=0, keepempty::Bool=false)
Expand Down
2 changes: 1 addition & 1 deletion base/sysinfo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ function which(program_name::String)
# If we have been given just a program name (not a relative or absolute
# path) then we should search `PATH` for it here:
pathsep = iswindows() ? ';' : ':'
path_dirs = abspath.(split(get(ENV, "PATH", ""), pathsep))
path_dirs = map(abspath, eachsplit(get(ENV, "PATH", ""), pathsep))

# On windows we always check the current directory as well
if iswindows()
Expand Down
2 changes: 1 addition & 1 deletion base/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ function with_output_color(@nospecialize(f::Function), color::Union{Int, Symbol}
(bold ? disable_text_style[:bold] : "") *
get(disable_text_style, color, text_colors[:default])
first = true
for line in split(str, '\n')
for line in eachsplit(str, '\n')
first || print(buf, '\n')
first = false
isempty(line) && continue
Expand Down
2 changes: 1 addition & 1 deletion base/version.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ const VERSION_REGEX = r"^
$"ix

function split_idents(s::AbstractString)
idents = split(s, '.')
idents = eachsplit(s, '.')
pidents = Union{UInt64,String}[occursin(r"^\d+$", ident) ? parse(UInt64, ident) : String(ident) for ident in idents]
return tuple(pidents...)::VerTuple
end
Expand Down

0 comments on commit dba8a08

Please sign in to comment.