Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

string overhaul #24999

Merged
merged 22 commits into from
Dec 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e596847
string overhaul: new Char representation, revamped core string API
StefanKarpinski Dec 8, 2017
274cf84
LineEdit: use character syntax for wildcard character
StefanKarpinski Dec 8, 2017
4d8b90f
strings/string: use checkbounds helper in a few places
StefanKarpinski Dec 8, 2017
6e15be8
repeat(Char, Integer): update & simplify for new Char rep
StefanKarpinski Dec 8, 2017
5cd9263
replace UnicodeError with StringIndexError
StefanKarpinski Dec 9, 2017
37aff06
See also: use `[name](@ref)` cross reference links.
StefanKarpinski Dec 9, 2017
7eb0def
address Milan's review comments
StefanKarpinski Dec 9, 2017
fcae423
use stevengj's suggested `hash(Char)` definition
StefanKarpinski Dec 9, 2017
f65c90b
add ncodeunits to stdlib docs
StefanKarpinski Dec 10, 2017
100d819
strings: some formatting tweaks
StefanKarpinski Dec 10, 2017
166924c
delete bswap(::Char)
StefanKarpinski Dec 10, 2017
dcf9552
deprecate chr2ind and ind2chr
StefanKarpinski Dec 10, 2017
61d5003
move string search functions into strings/search.jl
StefanKarpinski Dec 11, 2017
c713dff
optimize the length(::String) method better
StefanKarpinski Dec 11, 2017
b8cd96e
make length(string, i, j) ≥ 0 for all i, j
StefanKarpinski Dec 11, 2017
f9e1acb
moar Unicode operators
StefanKarpinski Dec 11, 2017
a7face9
change some code point conversions to char comparisons
StefanKarpinski Dec 12, 2017
1f0c6fa
various String performance tweaks
StefanKarpinski Dec 11, 2017
6f10ca2
add various `@propagate_inbounds` annotations
StefanKarpinski Dec 12, 2017
937c3ad
isvalid: return false out of bounds instead of throwing
StefanKarpinski Dec 12, 2017
feb1f68
bounds checks on string length(s, i, j)
StefanKarpinski Dec 11, 2017
8de25f5
bounds check thisind, nextind and prevind as well
StefanKarpinski Dec 12, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 76 additions & 13 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,58 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
# Exception carrying a `Char` whose stored bits could not be decoded to a code point
# (raised through `malformed_char`).
struct MalformedCharError <: Exception
    char::Char
end

# Exception carrying an integer that could not be turned into a `Char`
# (raised through `code_point_err`).
struct CodePointError <: Exception
    code::Integer
end

# The throw sites live in their own `@noinline` functions so that callers only
# contain a call on the error path.
@noinline function malformed_char(c::Char)
    throw(MalformedCharError(c))
end

@noinline function code_point_err(u::UInt32)
    throw(CodePointError(u))
end

# Return `true` when the raw 32 bits stored in `c` fail any of the three
# structural checks below, `false` otherwise.
function ismalformed(c::Char)
    bits = reinterpret(UInt32, c)
    # 8 × (number of leading one bits of the most significant byte run).
    lead = leading_ones(bits) << 3
    # Trailing zero bits rounded down to a whole number of bytes (56 == 0b111000).
    pad = trailing_zeros(bits) & 56
    # Exactly one leading one bit: the first byte looks like a continuation byte.
    lone_continuation = lead == 8
    # The announced length plus the unused low bytes does not fit in 32 bits.
    too_short = lead + pad > 32
    # Some used byte after the first does not have the `10xxxxxx` shape.
    bad_continuation = ((bits & 0x00c0c0c0) ⊻ 0x00808080) >> pad != 0
    return lone_continuation | too_short | bad_continuation
end

# Decode the bits stored in `c` into its code point.
#
# Returns the code point as a `UInt32`. Calls `malformed_char(c)` (which throws)
# when the stored bits fail the structural checks below.
function convert(::Type{UInt32}, c::Char)
    # TODO: use optimized inline LLVM
    u = reinterpret(UInt32, c)
    # High bit clear: the value is the single byte held in the top 8 bits.
    u < 0x80000000 && return u >> 24
    l1 = leading_ones(u)
    # Trailing zero bits rounded down to whole bytes (56 == 0b111000).
    t0 = trailing_zeros(u) & 56
    (l1 == 1) | (8l1 + t0 > 32) |
    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
        malformed_char(c)::Union{}
    # Clear the leading one bits of the first byte, then drop the unused low bytes.
    u &= 0xffffffff >> l1
    u >>= t0
    # Pack the payload bits of each byte together. The mask must be applied
    # before the shift; the parentheses make that explicit so the result does
    # not depend on the relative precedence of `&` and `>>`.
    return ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
           ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
end

# Build the `Char` whose stored bits encode code point `u`.
#
# Values below 0x80 are stored directly in the top byte. Values of 0x00200000
# and above are rejected through `code_point_err(u)` (which throws).
function convert(::Type{Char}, u::UInt32)
    if u < 0x80
        return reinterpret(Char, u << 24)
    end
    u < 0x00200000 || code_point_err(u)::Union{}
    # Spread the code point into 6-bit groups, one group per byte.
    spread = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
             ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
    # Left-align the used bytes and OR in the lead/continuation marker bits
    # for a 2-, 3- or 4-byte sequence.
    bits = if u < 0x00000800
        (spread << 16) | 0xc0800000
    elseif u < 0x00010000
        (spread << 8) | 0xe0808000
    else
        spread | 0xf0808080
    end
    return reinterpret(Char, bits)
end

# Convert `c` to an 8-bit integer type `T` (`Int8` or `UInt8`).
#
# When the stored bits, read as an `Int32`, are non-negative, the top byte is
# taken directly; otherwise the conversion goes through `UInt32(c)` and `T(...)`.
function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
    raw = reinterpret(Int32, c)
    if raw < 0
        return T(UInt32(c))
    end
    return (raw >>> 24) % T
end

# Convert an 8-bit integer to a `Char`.
#
# Values in 0:0x7f are placed directly in the top byte of the `Char` bits;
# anything else is routed through `Char(UInt32(b))`.
function convert(::Type{Char}, b::Union{Int8,UInt8})
    if 0 ≤ b ≤ 0x7f
        return reinterpret(Char, (b % UInt32) << 24)
    end
    return Char(UInt32(b))
end

convert(::Type{Char}, x::Number) = Char(UInt32(x))
convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))

rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
Expand All @@ -29,19 +79,16 @@ done(c::Char, state) = state
isempty(c::Char) = false
in(x::Char, y::Char) = x == y

==(x::Char, y::Char) = UInt32(x) == UInt32(y)
isless(x::Char, y::Char) = UInt32(x) < UInt32(y)

const hashchar_seed = 0xd4d64234
hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

-(x::Char, y::Char) = Int(x) - Int(y)
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
+(x::Integer, y::Char) = y + x

bswap(x::Char) = Char(bswap(UInt32(x)))

print(io::IO, c::Char) = (write(io, c); nothing)

const hex_chars = UInt8['0':'9';'a':'z']
Expand All @@ -66,21 +113,37 @@ function show(io::IO, c::Char)
end
if Unicode.isprint(c)
write(io, 0x27, c, 0x27)
else
elseif !ismalformed(c)
u = UInt32(c)
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
d = max(2, 8 - (leading_zeros(u) >> 2))
while 0 < d
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
end
write(io, 0x27)
else # malformed
write(io, 0x27)
u = reinterpret(UInt32, c)
while true
a = hex_chars[((u >> 28) & 0xf) + 1]
b = hex_chars[((u >> 24) & 0xf) + 1]
write(io, 0x5c, 'x', a, b)
(u <<= 8) == 0 && break
end
write(io, 0x27)
end
return
end

function show(io::IO, ::MIME"text/plain", c::Char)
show(io, c)
u = UInt32(c)
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
print(io, " (category ", Unicode.category_abbrev(c), ": ", Unicode.category_string(c), ")")
if !ismalformed(c)
u = UInt32(c)
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
else
print(io, ": Malformed UTF-8")
end
abr = Unicode.category_abbrev(c)
str = Unicode.category_string(c)
print(io, " (category ", abr, ": ", str, ")")
end
4 changes: 4 additions & 0 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2992,6 +2992,10 @@ end
@deprecate_binding Complex64 ComplexF32
@deprecate_binding Complex128 ComplexF64

# PR #24999
@deprecate ind2chr(s::AbstractString, i::Integer) length(s, 1, i)
@deprecate chr2ind(s::AbstractString, n::Integer) nextind(s, 0, n)

# END 0.7 deprecations

# BEGIN 1.0 deprecations
Expand Down
4 changes: 1 addition & 3 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ export
NullException,
ParseError,
SystemError,
UnicodeError,
StringIndexError,

# Global constants and variables
ARGS,
Expand Down Expand Up @@ -716,7 +716,6 @@ export
bytes2hex,
chomp,
chop,
chr2ind,
codeunit,
dec,
digits,
Expand All @@ -728,7 +727,6 @@ export
hex,
hex2bytes,
hex2bytes!,
ind2chr,
info,
ismatch,
isvalid,
Expand Down
20 changes: 20 additions & 0 deletions base/filesystem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
return ret % UInt8
end

# Read one `Char` from `f`, keeping the bytes exactly as they appear in the file.
#
# The first byte is always consumed and stored in the top byte of the result.
# Following bytes are consumed only while they have the `10xxxxxx` form and the
# first byte announces room for them; a byte that does not match is un-read by
# seeking back to its position. Reading stops early at end of file.
function read(f::File, ::Type{Char})
    b0 = read(f, UInt8)
    # Lowest shift at which a following byte may be stored: 16, 8 or 0 for a
    # first byte with 2, 3 or 4 leading one bits.
    l = 8(4-leading_ones(b0))
    c = UInt32(b0) << 24
    # `l` is negative for a first byte with 5 or more leading ones. Without the
    # lower bound the loop would run with `s < 0`, consuming bytes that are then
    # shifted out of `c` and silently lost.
    if 0 ≤ l < 24
        s = 16
        while s ≥ l && !eof(f)
            p = position(f)
            b = read(f, UInt8)
            if b & 0xc0 != 0x80
                # Not a continuation byte: leave it for the next read.
                seek(f, p)
                break
            end
            c |= UInt32(b) << s
            s -= 8
        end
    end
    return reinterpret(Char, c)
end

function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
check_open(f)
ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),
Expand Down
4 changes: 2 additions & 2 deletions base/intfuncs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
@eval begin
($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
($sym)(x::Unsigned) = ($sym)(x,1,false)
($sym)(x::Char, p::Int) = ($sym)(unsigned(x),p,false)
($sym)(x::Char) = ($sym)(unsigned(x),1,false)
($sym)(x::Char, p::Int) = ($sym)(UInt32(x),p,false)
($sym)(x::Char) = ($sym)(UInt32(x),1,false)
($sym)(x::Integer, p::Int) = ($sym)(unsigned(abs(x)),p,x<0)
($sym)(x::Integer) = ($sym)(unsigned(abs(x)),1,x<0)
end
Expand Down
66 changes: 25 additions & 41 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -535,25 +535,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
end
end


function write(s::IO, ch::Char)
c = reinterpret(UInt32, ch)
if c < 0x80
return write(s, c%UInt8)
elseif c < 0x800
return (write(s, (( c >> 6 ) | 0xC0)%UInt8)) +
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
elseif c < 0x10000
return (write(s, (( c >> 12 ) | 0xE0)%UInt8)) +
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
elseif c < 0x110000
return (write(s, (( c >> 18 ) | 0xF0)%UInt8)) +
(write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
else
return write(s, '\ufffd')
function write(io::IO, c::Char)
u = bswap(reinterpret(UInt32, c))
n = 1
while true
write(io, u % UInt8)
(u >>= 8) == 0 && return n
n += 1
end
end

Expand Down Expand Up @@ -596,31 +584,28 @@ function read!(s::IO, a::Array{T}) where T
return a
end

function read(s::IO, ::Type{Char})
ch = read(s, UInt8)
if ch < 0x80
return Char(ch)
end

# mimic utf8.next function
trailing = Base.utf8_trailing[ch+1]
c::UInt32 = 0
for j = 1:trailing
c += ch
c <<= 6
ch = read(s, UInt8)
function read(io::IO, ::Type{Char})
b0 = read(io, UInt8)
l = 8(4-leading_ones(b0))
c = UInt32(b0) << 24
if l < 24
s = 16
while s ≥ l && !eof(io)
peek(io) & 0xc0 == 0x80 || break
b = read(io, UInt8)
c |= UInt32(b) << s
s -= 8
end
end
c += ch
c -= Base.utf8_offset[trailing+1]
return Char(c)
return reinterpret(Char, c)
end

# readuntil_string is useful below since it has
# an optimized method for s::IOStream
readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))

function readuntil(s::IO, delim::Char)
if delim < Char(0x80)
if delim ≤ '\x7f'
return readuntil_string(s, delim % UInt8)
end
out = IOBuffer()
Expand Down Expand Up @@ -701,7 +686,7 @@ function readuntil(io::IO, target::AbstractString)
i = start(target)
done(target, i) && return ""
c, i = next(target, start(target))
if done(target, i) && c < Char(0x80)
if done(target, i) && c <= '\x7f'
return readuntil_string(io, c % UInt8)
end
# decide how we can index target
Expand All @@ -728,12 +713,11 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
return out
end


"""
readchomp(x)

Read the entirety of `x` as a string and remove a single trailing newline.
Equivalent to `chomp!(read(x, String))`.
Read the entirety of `x` as a string and remove a single trailing newline
if there is one. Equivalent to `chomp(read(x, String))`.

# Examples
```jldoctest
Expand All @@ -747,7 +731,7 @@ julia> readchomp("my_file.txt")
julia> rm("my_file.txt");
```
"""
readchomp(x) = chomp!(read(x, String))
readchomp(x) = chomp(read(x, String))

# read up to nb bytes into nb, returning # bytes read

Expand Down
32 changes: 17 additions & 15 deletions base/iostream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,13 @@ end

## low-level calls ##

write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
# Write the single byte `b` to `s` via `ios_putc` and return that call's result
# as an `Int`. Throws `ArgumentError` when `s` is not writable.
function write(s::IOStream, b::UInt8)
    if !iswritable(s)
        throw(ArgumentError("write failed, IOStream is not writeable"))
    end
    ret = ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)
    return Int(ret)
end

function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
if !iswritable(s)
throw(ArgumentError("write failed, IOStream is not writeable"))
end
iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
end

Expand Down Expand Up @@ -353,14 +354,6 @@ end

## text I/O ##

function write(s::IOStream, c::Char)
if !iswritable(s)
throw(ArgumentError("write failed, IOStream is not writeable"))
end
Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
end
read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))

take!(s::IOStream) =
ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)

Expand Down Expand Up @@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
end

## Character streams ##
const _chtmp = Ref{Char}()

function peekchar(s::IOStream)
if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0
chref = Ref{UInt32}()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this hurt performance?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not according to @Keno who suggested that I move the externally allocated box inside IIRC.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

welcome to the brave new world of julia 0.7

if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0
return typemax(Char)
end
return _chtmp[]
return Char(chref[])
end

# Return the result of `ios_peekc` for `s` as a `Cint` (presumably the next byte
# without consuming it — confirm against the C implementation).
peek(s::IOStream) = ccall(:ios_peekc, Cint, (Ptr{Void},), s)

# Generic fallback: return the next byte of `s` and restore the read position.
#
# Implemented with `mark`/`reset`, so the position is restored even when the
# read throws. NOTE: this uses the stream's own mark, so a mark set earlier by
# the caller is replaced here.
function peek(s::IO)
    mark(s)
    try
        return read(s, UInt8)
    finally
        reset(s)
    end
end
4 changes: 2 additions & 2 deletions base/parse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ end
## string to float functions ##

tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)

tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)

Expand Down
Loading