Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

copyuntil(out::IO, in::IO, delim) #48273

Merged
merged 24 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Build system changes

New library functions
---------------------
* `copyuntil(out, io, delim)` and `copyline(out, io)` copy data into an `out::IO` stream ([#48273]).

New library features
--------------------
Expand Down
2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,8 @@ export
readline,
readlines,
readuntil,
copyuntil,
copyline,
redirect_stdio,
redirect_stderr,
redirect_stdin,
Expand Down
163 changes: 125 additions & 38 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,10 @@ for f in (
end
read(io::AbstractPipe, byte::Type{UInt8}) = read(pipe_reader(io)::IO, byte)::UInt8
unsafe_read(io::AbstractPipe, p::Ptr{UInt8}, nb::UInt) = unsafe_read(pipe_reader(io)::IO, p, nb)
readuntil(io::AbstractPipe, arg::UInt8; kw...) = readuntil(pipe_reader(io)::IO, arg; kw...)
readuntil(io::AbstractPipe, arg::AbstractChar; kw...) = readuntil(pipe_reader(io)::IO, arg; kw...)
readuntil(io::AbstractPipe, arg::AbstractString; kw...) = readuntil(pipe_reader(io)::IO, arg; kw...)
readuntil(io::AbstractPipe, arg::AbstractVector; kw...) = readuntil(pipe_reader(io)::IO, arg; kw...)
copyuntil(out::IO, io::AbstractPipe, arg::UInt8; kw...) = copyuntil(out, pipe_reader(io)::IO, arg; kw...)
copyuntil(out::IO, io::AbstractPipe, arg::AbstractChar; kw...) = copyuntil(out, pipe_reader(io)::IO, arg; kw...)
copyuntil(out::IO, io::AbstractPipe, arg::AbstractString; kw...) = copyuntil(out, pipe_reader(io)::IO, arg; kw...)
copyuntil(out::IO, io::AbstractPipe, arg::AbstractVector; kw...) = copyuntil(out, pipe_reader(io)::IO, arg; kw...)
readuntil_vector!(io::AbstractPipe, target::AbstractVector, keep::Bool, out) = readuntil_vector!(pipe_reader(io)::IO, target, keep, out)
readbytes!(io::AbstractPipe, target::AbstractVector{UInt8}, n=length(target)) = readbytes!(pipe_reader(io)::IO, target, n)
peek(io::AbstractPipe, ::Type{T}) where {T} = peek(pipe_reader(io)::IO, T)::T
Expand Down Expand Up @@ -499,11 +499,15 @@ read!(filename::AbstractString, a) = open(io->read!(io, a), convert(String, file
readuntil(stream::IO, delim; keep::Bool = false)
readuntil(filename::AbstractString, delim; keep::Bool = false)

Read a string from an I/O stream or a file, up to the given delimiter.
Read a string from an I/O `stream` or a file, up to the given delimiter.
The delimiter can be a `UInt8`, `AbstractChar`, string, or vector.
Keyword argument `keep` controls whether the delimiter is included in the result.
The text is assumed to be encoded in UTF-8.

Return a `String` if `delim` is an `AbstractChar` or a string
or otherwise return a `Vector{typeof(delim)}`. See also [`copyuntil`](@ref)
to instead write in-place to another stream (which can be a preallocated [`IOBuffer`](@ref)).

# Examples
```jldoctest
julia> write("my_file.txt", "JuliaLang is a GitHub organization.\\nIt has many members.\\n");
Expand All @@ -517,7 +521,39 @@ julia> readuntil("my_file.txt", '.', keep = true)
julia> rm("my_file.txt")
```
"""
readuntil(filename::AbstractString, args...; kw...) = open(io->readuntil(io, args...; kw...), convert(String, filename)::String)
readuntil(filename::AbstractString, delim; kw...) = open(io->readuntil(io, delim; kw...), convert(String, filename)::String)
readuntil(stream::IO, delim::UInt8; kw...) = _unsafe_take!(copyuntil(IOBuffer(sizehint=70), stream, delim; kw...))
readuntil(stream::IO, delim::Union{AbstractChar, AbstractString}; kw...) = String(_unsafe_take!(copyuntil(IOBuffer(sizehint=70), stream, delim; kw...)))


"""
copyuntil(out::IO, stream::IO, delim; keep::Bool = false)
copyuntil(out::IO, filename::AbstractString, delim; keep::Bool = false)

Copy a string from an I/O `stream` or a file, up to the given delimiter, to
the `out` stream, returning `out`.
The delimiter can be a `UInt8`, `AbstractChar`, string, or vector.
Keyword argument `keep` controls whether the delimiter is included in the result.
The text is assumed to be encoded in UTF-8.

Similar to [`readuntil`](@ref), which returns a `String`; in contrast,
`copyuntil` writes directly to `out`, without allocating a string.
(This can be used, for example, to read data into a pre-allocated [`IOBuffer`](@ref).)

# Examples
```jldoctest
julia> write("my_file.txt", "JuliaLang is a GitHub organization.\\nIt has many members.\\n");

julia> String(take!(copyuntil(IOBuffer(), "my_file.txt", 'L')))
"Julia"

julia> String(take!(copyuntil(IOBuffer(), "my_file.txt", '.', keep = true)))
"JuliaLang is a GitHub organization."

julia> rm("my_file.txt")
```
"""
copyuntil(out::IO, filename::AbstractString, delim; kw...) = open(io->copyuntil(out, io, delim; kw...), convert(String, filename)::String)

"""
readline(io::IO=stdin; keep::Bool=false)
Expand All @@ -530,6 +566,11 @@ false (as it is by default), these trailing newline characters are removed from
line before it is returned. When `keep` is true, they are returned as part of the
line.

Return a `String`. See also [`copyline`](@ref) to instead write in-place
to another stream (which can be a preallocated [`IOBuffer`](@ref)).

See also [`readuntil`](@ref) for reading until more general delimiters.

# Examples
```jldoctest
julia> write("my_file.txt", "JuliaLang is a GitHub organization.\\nIt has many members.\\n");
Expand All @@ -551,21 +592,63 @@ Logan
"Logan"
```
"""
readline(filename::AbstractString; keep::Bool=false) =
    open(io -> readline(io; keep), filename)

# Stream method: copy one line from `s` (default `stdin`) into a scratch `IOBuffer`
# via `copyline`, then turn the collected bytes into a `String`.
# `sizehint=70` is only an initial capacity hint for the scratch buffer.
readline(s::IO=stdin; keep::Bool=false) =
    String(_unsafe_take!(copyline(IOBuffer(sizehint=70), s; keep)))

"""
copyline(out::IO, io::IO=stdin; keep::Bool=false)
copyline(out::IO, filename::AbstractString; keep::Bool=false)

Copy a single line of text from the I/O stream `io` or a file to the `out` stream,
returning `out`.

When reading from a file, the text is assumed to be encoded in UTF-8. Lines in the
input end with `'\\n'` or `"\\r\\n"` or the end of an input stream. When `keep` is
false (as it is by default), these trailing newline characters are removed from the
line before it is written to `out`. When `keep` is true, they are written as part of
the line.

Similar to [`readline`](@ref), which returns a `String`; in contrast,
`copyline` writes directly to `out`, without allocating a string.
(This can be used, for example, to read data into a pre-allocated [`IOBuffer`](@ref).)

See also [`copyuntil`](@ref) for reading until more general delimiters.

# Examples
```jldoctest
julia> write("my_file.txt", "JuliaLang is a GitHub organization.\\nIt has many members.\\n");

julia> String(take!(copyline(IOBuffer(), "my_file.txt")))
"JuliaLang is a GitHub organization."

julia> String(take!(copyline(IOBuffer(), "my_file.txt", keep=true)))
"JuliaLang is a GitHub organization.\\n"

julia> rm("my_file.txt")
```
"""
copyline(out::IO, filename::AbstractString; keep::Bool=false) =
open(io -> copyline(out, io; keep), filename)

# fallback to optimized methods for IOBuffer in iobuffer.jl
function copyline(out::IO, s::IO=stdin; keep::Bool=false)
    # With `keep`, the terminator is copied verbatim, so this reduces to
    # "copy everything through the first '\n'".
    keep && return copyuntil(out, s, 0x0a, keep=true)

    # Otherwise the trailing "\n" or "\r\n" must be dropped. A '\r' is held back
    # until the following byte shows whether it belongs to a "\r\n" terminator.
    while !eof(s)
        b = read(s, UInt8)
        b == 0x0a && break
        # Loop (rather than a single `if`) so that a run such as "\r\r\n" only
        # drops the final "\r\n" and still emits the earlier '\r'.
        while b == 0x0d && !eof(s)
            b = read(s, UInt8)
            b == 0x0a && return out   # held-back '\r' was part of "\r\n"
            write(out, 0x0d)          # held-back '\r' was ordinary data
        end
        write(out, b)
    end
    return out
end


Expand Down Expand Up @@ -816,36 +899,38 @@ end
# read(io, T) is not defined for other AbstractChar: implementations
# must provide their own encoding-specific method.

# readuntil_string is useful below since it has
# an optimized method for s::IOStream
readuntil_string(s::IO, delim::UInt8, keep::Bool) = String(readuntil(s, delim, keep=keep))::String

function copyuntil(out::IO, s::IO, delim::AbstractChar; keep::Bool=false)
    # A delimiter ≤ '\x7f' is a single byte in UTF-8, so hand it to the
    # byte-delimiter method.
    if delim ≤ '\x7f'
        return copyuntil(out, s, delim % UInt8; keep)
    end
    # Otherwise read `s` one `Char` at a time and forward each character to `out`
    # until the delimiter is seen (it is written only when `keep` is true).
    for c in readeach(s, Char)
        if c == delim
            keep && write(out, c)
            break
        end
        write(out, c)
    end
    return out
end

# note: optimized methods of copyuntil for IOStreams and delim::UInt8 in iostream.jl
function _copyuntil(out, s::IO, delim::T, keep::Bool) where T
    # `out` may be a stream (elements are written) or a collection (elements are pushed)
    output! = isa(out, IO) ? write : push!
    for c in readeach(s, T)
        if c == delim
            # the delimiter itself is emitted only on request
            keep && output!(out, c)
            break
        end
        output!(out, c)
    end
    return out
end
# Generic delimiter type: collect the elements read before `delim` into a `Vector{T}`.
readuntil(s::IO, delim::T; keep::Bool=false) where T =
    _copyuntil(Vector{T}(), s, delim, keep)
readuntil(s::IO, delim::UInt8; keep::Bool=false) =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overwrites the definition on line 525, giving a warning during sysimage build.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to catch those warnings and turn them into errors on CI when building the sysimage? I guess we don't want this kind of situations.

Copy link
Member Author

@stevengj stevengj Jul 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will file a PR to fix this shortly. Filed #50485

_copyuntil(resize!(StringVector(70), 0), s, delim, keep)
copyuntil(out::IO, s::IO, delim::T; keep::Bool=false) where T =
_copyuntil(out, s, delim, keep)

# requires that indices for target are the integer unit range from firstindex to lastindex
# returns whether the delimiter was matched
Expand Down Expand Up @@ -933,27 +1018,29 @@ function readuntil_vector!(io::IO, target::AbstractVector{T}, keep::Bool, out) w
return false
end

function copyuntil(out::IO, io::IO, target::AbstractString; keep::Bool=false)
    # small-string target optimizations
    x = Iterators.peel(target)
    # empty target: nothing is copied
    isnothing(x) && return out
    c, rest = x
    # a one-character ASCII target is just a byte delimiter
    if isempty(rest) && c <= '\x7f'
        return copyuntil(out, io, c % UInt8; keep)
    end
    # convert String to a utf8-byte-iterator
    if !(target isa String) && !(target isa SubString{String})
        target = String(target)
    end
    target = codeunits(target)::AbstractVector
    return copyuntil(out, io, target, keep=keep)
end

function readuntil(io::IO, target::AbstractVector{T}; keep::Bool=false) where T
    # Byte targets collect into a `StringVector` created with length 70 and resized
    # to 0, so it starts out empty; any other element type uses a plain `Vector{T}`.
    out = (T === UInt8 ? resize!(StringVector(70), 0) : Vector{T}())
    readuntil_vector!(io, target, keep, out)
    return out
end
# Stream-output variant: `readuntil_vector!` emits directly into `out`; its Bool
# result (whether the delimiter was matched) is discarded and `out` is returned.
copyuntil(out::IO, io::IO, target::AbstractVector; keep::Bool=false) =
    (readuntil_vector!(io, target, keep, out); out)

"""
readchomp(x)
Expand Down Expand Up @@ -1128,7 +1215,7 @@ function iterate(r::Iterators.Reverse{<:EachLine}, state)
buf.size = _stripnewline(r.itr.keep, buf.size, buf.data)
empty!(chunks) # will cause next iteration to terminate
seekend(r.itr.stream) # reposition to end of stream for isdone
s = String(take!(buf))
s = String(_unsafe_take!(buf))
else
# extract the string from chunks[ichunk][inewline+1] to chunks[jchunk][jnewline]
if ichunk == jchunk # common case: current and previous newline in same chunk
Expand All @@ -1145,7 +1232,7 @@ function iterate(r::Iterators.Reverse{<:EachLine}, state)
end
write(buf, view(chunks[jchunk], 1:jnewline))
buf.size = _stripnewline(r.itr.keep, buf.size, buf.data)
s = String(take!(buf))
s = String(_unsafe_take!(buf))

# overwrite obsolete chunks (ichunk+1:jchunk)
i = jchunk
Expand Down
66 changes: 43 additions & 23 deletions base/iobuffer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -516,33 +516,53 @@ function occursin(delim::UInt8, buf::GenericIOBuffer)
return false
end

function copyuntil(out::IO, io::GenericIOBuffer, delim::UInt8; keep::Bool=false)
    # unread portion of the buffer
    data = view(io.data, io.ptr:io.size)
    # note: findfirst + copyto! is much faster than a single loop
    #       except for nout ≲ 20.  A single loop is 2x faster for nout=5.
    # `nread` bytes are consumed from `io` (through the delimiter, or everything if
    # it is absent); `nout` bytes of those are copied to `out`.
    nout = nread = something(findfirst(==(delim), data), length(data))
    if !keep && nout > 0 && data[nout] == delim
        nout -= 1   # drop the delimiter from the output
    end
    write(out, view(io.data, io.ptr:io.ptr+nout-1))
    io.ptr += nread
    return out
end

function copyline(out::GenericIOBuffer, s::IO; keep::Bool=false)
    # Index in `out.data` where newly copied bytes will begin. Per Base's IOBuffer
    # write semantics, append-mode buffers grow at `size`, others write at `ptr`.
    start = out.append ? out.size + 1 : out.ptr
    copyuntil(out, s, 0x0a, keep=true)
    keep && return out
    # Nothing was copied (e.g. `s` was at EOF): leave earlier content of `out` alone
    # instead of stripping a newline that belongs to it.
    (out.append ? out.size + 1 : out.ptr) == start && return out
    line = out.data
    i = out.size
    if i == 0 || line[i] != 0x0a
        return out
    elseif i - 1 < start || line[i-1] != 0x0d
        # plain "\n" terminator (a preceding '\r' that predates this copy is kept)
        i -= 1
    else
        # "\r\n" terminator
        i -= 2
    end
    # truncate the buffer so the terminator is no longer part of its contents
    out.size = i
    if !out.append
        out.ptr = i+1
    end
    return out
end

function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false)
    start = io.ptr
    pending = view(io.data, start:io.size)   # bytes of `io` not yet consumed
    # note: findfirst + copyto! is much faster than a single loop
    #       except for nout ≲ 20.  A single loop is 2x faster for nout=5.
    hit = findfirst(==(0x0a), pending)
    consumed = hit === nothing ? length(pending) : hit
    # number of bytes to emit: everything consumed, minus a trailing "\n" or "\r\n"
    # unless the caller asked to keep the terminator
    emitted = consumed
    if !keep && emitted > 0 && pending[emitted] == 0x0a
        emitted -= 1
        if emitted > 0 && pending[emitted] == 0x0d
            emitted -= 1
        end
    end
    write(out, view(io.data, start:start+emitted-1))
    io.ptr += consumed
    return out
end
# Two entry points forward to the shared implementation; the second one covers the
# case where both arguments are `GenericIOBuffer`s.
copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep=keep)
copyline(out::GenericIOBuffer, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep=keep)


# copy-free crc32c of IOBuffer:
function _crc32c(io::IOBuffer, nb::Integer, crc::UInt32=0x00000000)
Expand Down
Loading