Skip to content

Commit

Permalink
readuntil() for any delim type
Browse files Browse the repository at this point in the history
returns a string for Char delim, otherwise an array
readuntil(s, uint8('\n')) provides a way to read a line without an encoding check
streams are encouraged to provide an efficient readuntil(s, ::Uint8),
  then reading until ASCII delimiters is fast, leaving the encoding
  logic to higher-level layers in io.jl and string.jl.
for #1792
  • Loading branch information
JeffBezanson committed Dec 30, 2012
1 parent de27d83 commit 3434a2e
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 49 deletions.
24 changes: 20 additions & 4 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,12 @@ function read(s::IO, ::Type{Char})
char(c)
end

function readuntil(s::IO, delim)
function readuntil(s::IO, delim::Char)
if delim < 0x80
data = readuntil(s, uint8(delim))
enc = byte_string_classify(data)
return (enc==1) ? ASCIIString(data) : UTF8String(data)
end
out = memio()
while !eof(s)
c = read(s, Char)
Expand All @@ -134,6 +139,18 @@ function readuntil(s::IO, delim)
takebuf_string(out)
end

# Generic fallback for delimiters of any type T: values of type T are read
# from `s` one at a time and collected into a T[] array. Reading stops once a
# value equal to `delim` has been read (it is kept as the last element) or
# when `s` reports eof. The collected array is returned.
function readuntil{T}(s::IO, delim::T)
    items = T[]
    found = false
    # `found` is tested first so eof(s) is not queried again after the delimiter
    while !found && !eof(s)
        item = read(s, T)
        push(items, item)
        if item == delim
            found = true
        end
    end
    return items
end

# Read a line from `s` by reading until the '\n' delimiter via readuntil.
function readline(s::IO)
    return readuntil(s, '\n')
end

function readall(s::IO)
Expand Down Expand Up @@ -395,9 +412,8 @@ end

# One-argument write: forwards `x` to OUTPUT_STREAM, which is asserted to be
# an IOStream at the call site.
function write(x)
    return write(OUTPUT_STREAM::IOStream, x)
end

function readuntil(s::IOStream, delim)
# TODO: faster versions that avoid the encoding check
ccall(:jl_readuntil, ByteString, (Ptr{Void}, Uint8), s.ios, delim)
function readuntil(s::IOStream, delim::Uint8)
ccall(:jl_readuntil, Array{Uint8,1}, (Ptr{Void}, Uint8), s.ios, delim)
end

function readall(s::IOStream)
Expand Down
5 changes: 2 additions & 3 deletions base/iostring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,10 @@ function memchr(buf::IOString, delim)
q = ccall(:memchr,Ptr{Uint8},(Ptr{Uint8},Int32,Int32),p,delim,nb_available(buf))
nb = (q == C_NULL ? 0 : q-p+1)
end
function readuntil(io::IOString, delim)
function readuntil(io::IOString, delim::Uint8)
nb = memchr(io, delim)
if nb == 0
nb = nb_available(io)
end
readbytes(io,nb)
read(io, Array(Uint8, nb))
end

5 changes: 3 additions & 2 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -682,8 +682,9 @@ unescape_string(s::String) = sprint(length(s), print_unescaped, s)

## checking UTF-8 & ASCII validity ##

byte_string_classify(s::ByteString) =
ccall(:u8_isvalid, Int32, (Ptr{Uint8}, Int), s.data, length(s))
byte_string_classify(data::Array{Uint8,1}) =
ccall(:u8_isvalid, Int32, (Ptr{Uint8}, Int), data, length(data))
byte_string_classify(s::ByteString) = byte_string_classify(s.data)
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
Expand Down
35 changes: 6 additions & 29 deletions extras/gzip.jl
Original file line number Diff line number Diff line change
Expand Up @@ -402,46 +402,23 @@ end
# Read all of `s`, delegating to the two-argument readall with Z_BIG_BUFSIZE
# as the second argument.
function readall(s::GZipStream)
    return readall(s, Z_BIG_BUFSIZE)
end

# TODO: Create a c-wrapper based on gzreadline
# Read characters from the gzip stream `s` until `delim` has been read or the
# stream reports eof, and return the accumulated text from takebuf_string.
# A '\n' delimiter is handed off to readline(s).
function readuntil(s::GZipStream, delim)
    if delim == '\n'
        return readline(s)
    end

    acc = memio(GZ_LINE_BUFSIZE, false)
    # The first read sits outside the try block, so an EOFError here propagates.
    ch = read(s, Char)
    print(acc, ch)
    while ch != delim && !eof(s)
        try
            ch = read(s, Char)
            print(acc, ch)
        catch err
            # An EOFError inside the loop is tolerated; anything else is re-raised.
            if !isa(err, EOFError)
                throw(err)
            end
        end
    end
    check_eof(s)
    return takebuf_string(acc)
end


function readline(s::GZipStream)
function readuntil(s::GZipStream, c::Uint8)
buf = Array(Uint8, GZ_LINE_BUFSIZE)
pos = 1

if gzgets(s, buf) == C_NULL # Throws an exception on error
return ""
return buf[1:0]
end

while(true)
# since gzgets didn't return C_NULL, there must be a \0 in the buffer
eos = memchr(buf, '\0', pos)
if eos == 1 || buf[eos-1] == '\n'
return bytestring(buf[1:eos-1])
if eos == 1 || buf[eos-1] == c
return buf[1:eos-1]
end

# If we're at the end of the file, return what has been read so far
if eof(s) return bytestring(buf[1:eos-1]) end
if eof(s) return buf[1:eos-1] end

# Otherwise, append to the end of the previous buffer

Expand All @@ -454,7 +431,7 @@ function readline(s::GZipStream)
if gzgets(s, pointer(buf)+pos-1, GZ_LINE_BUFSIZE) == C_NULL
# eof(s); remove extra buffer space
grow(buf, -GZ_LINE_BUFSIZE)
return bytestring(buf)
return buf
end
end
end
Expand Down
9 changes: 1 addition & 8 deletions src/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -310,14 +310,7 @@ jl_value_t *jl_readuntil(ios_t *s, uint8_t delim)
((char*)a->data)[n] = '\0';
}
}
JL_GC_PUSH(&a);
jl_struct_type_t* string_type = u8_isvalid(jl_array_data(a), jl_array_len(a)) == 1 ? // ASCII
jl_ascii_string_type : jl_utf8_string_type;
jl_value_t *str = alloc_2w();
str->type = (jl_type_t*)string_type;
jl_set_nth_field(str, 0, (jl_value_t*)a);
JL_GC_POP();
return str;
return (jl_value_t*)a;
}

void jl_free2(void *p, void *hint)
Expand Down
6 changes: 3 additions & 3 deletions test/perf2/perf2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ srand(1776) # get more consistent times

require("$JULIA_HOME/../../examples/list.jl")

function listn1n2(n1::Int64,n2::Int64)
l1 = Nil{Int64}()
function listn1n2(n1::Int,n2::Int)
l1 = Nil{Int}()
for i=n2:-1:n1
l1 = Cons{Int64}(i,l1)
l1 = Cons{Int}(i,l1)
end
l1
end
Expand Down

0 comments on commit 3434a2e

Please sign in to comment.