diff --git a/NEWS.md b/NEWS.md index 7b29596268ca7..926d4f4815a19 100644 --- a/NEWS.md +++ b/NEWS.md @@ -100,6 +100,8 @@ Library improvements * Efficient `mean` and `median` for ranges ([#8089]). + * `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]). + * Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake to provide uniform cross-platform behavior and up-to-date, locale-independent support for Unicode standards ([#5939]). @@ -1132,4 +1134,6 @@ Too numerous to mention. [#9133]: https://github.com/JuliaLang/julia/issues/9133 [#9144]: https://github.com/JuliaLang/julia/issues/9144 [#9249]: https://github.com/JuliaLang/julia/issues/9249 +[#9261]: https://github.com/JuliaLang/julia/issues/9261 [#9271]: https://github.com/JuliaLang/julia/issues/9271 +[#9294]: https://github.com/JuliaLang/julia/issues/9294 diff --git a/base/exports.jl b/base/exports.jl index f9cad968ff2b0..1d0c6fd48c051 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -822,6 +822,7 @@ export escape_string, float32_isvalid, float64_isvalid, + graphemes, hex, hex2bytes, ind2chr, diff --git a/base/string.jl b/base/string.jl index b16d7ccffeba2..80883e6fb5326 100644 --- a/base/string.jl +++ b/base/string.jl @@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data)) pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data)) - diff --git a/base/utf8.jl b/base/utf8.jl index b8d68aa4d3643..73c08cc4c0ff6 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int}) if !is_utf8_start(d[i]) i = nextind(s,i) end - if j > endof(s) + if j > length(d) 
throw(BoundsError()) end j = nextind(s,j)-1 diff --git a/base/utf8proc.jl b/base/utf8proc.jl index 4d177c0441b72..1a80b7b6896c0 100644 --- a/base/utf8proc.jl +++ b/base/utf8proc.jl @@ -1,10 +1,12 @@ # Various Unicode functionality from the utf8proc library module UTF8proc -import Base: show, showcompact, ==, string, symbol, isless +import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert + +export isgraphemebreak # also exported by Base: -export normalize_string, is_valid_char, is_assigned_char, +export normalize_string, graphemes, is_valid_char, is_assigned_char, islower, isupper, isalpha, isdigit, isnumber, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, isblank @@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11) const UTF8PROC_LUMP = (1<<12) const UTF8PROC_STRIPMARK = (1<<13) +############################################################################ + let const p = Array(Ptr{UInt8}, 1) global utf8proc_map @@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol) throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))) end +############################################################################ + # returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category function category_code(c) uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs @@ -118,8 +124,6 @@ end is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN -# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes? 
-
 ## libc character class predicates ##
 
 islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
@@ -168,4 +172,63 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
     end
 end
 
+############################################################################
+# iterators for grapheme segmentation
+
+# Whether utf8proc reports a grapheme break between the adjacent
+# codepoints c1 and c2 (thin wrapper around utf8proc_grapheme_break).
+isgraphemebreak(c1::Char, c2::Char) =
+    ccall(:utf8proc_grapheme_break, Bool, (Char, Char), c1, c2)
+
+# Iterator over the graphemes of a string; construct it with graphemes(s).
+immutable GraphemeIterator{S<:AbstractString}
+    s::S # original string (for generation of SubStrings)
+end
+graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
+
+eltype{S}(::GraphemeIterator{S}) = SubString{S}
+
+# Number of graphemes in g.s: one per codepoint that a break may precede.
+function length(g::GraphemeIterator)
+    c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
+    n = 0
+    for c in g.s
+        n += isgraphemebreak(c0, c)
+        c0 = c
+    end
+    return n
+end
+
+# the iteration state is an index into g.s, so start/done defer to the string
+start(g::GraphemeIterator) = start(g.s)
+done(g::GraphemeIterator, i) = done(g.s, i)
+
+# Returns (grapheme, k): the grapheme of g.s that begins at index i, as a
+# SubString of g.s, and the index k at which the following grapheme begins.
+function next(g::GraphemeIterator, i)
+    s = g.s
+    j = i
+    c0, k = next(s, i)
+    while !done(s, k) # loop until next grapheme is s[i:j]
+        c, ℓ = next(s, k)
+        isgraphemebreak(c0, c) && break
+        j = k
+        k = ℓ
+        c0 = c
+    end
+    # Build the SubString explicitly: s[i:j] is not guaranteed to be a
+    # SubString, but eltype above promises SubString elements.
+    return (SubString(s, i, j), k)
+end
+
+==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
+hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
+isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
+
+convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s)
+
+show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
+
+############################################################################
+
 end # module
diff --git a/deps/libmojibake b/deps/libmojibake
index df71da45dfbdf..86447ad060d6f 160000
--- a/deps/libmojibake
+++ b/deps/libmojibake
@@ -1 +1 @@
-Subproject commit df71da45dfbdf68bcc6fd656d1260d609c728ad7
+Subproject commit 86447ad060d6f4edf01f2a64b9598dfeeb6e6f7d
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
index 53ad73c672274..8a6294814780d 100644
--- a/doc/stdlib/base.rst
+++ b/doc/stdlib/base.rst
@@ -1417,6 +1417,14 @@ Strings
 
    For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
 
+.. function:: graphemes(s) -> iterator over substrings of s
+
+   Returns an iterator over substrings of ``s`` that correspond to
+   the extended graphemes in the string, as defined by Unicode UAX #29.
+   (Roughly, these are what users would perceive as single characters,
+   even though they may contain more than one codepoint; for example
+   a letter combined with an accent mark is a single grapheme.)
+
 .. function:: is_valid_ascii(s) -> Bool
 
    Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
diff --git a/test/strings.jl b/test/strings.jl
index 22f7a85210eae..c33b1e1642003 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
 Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
 @test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"
 
+# make sure substrings handle last code unit even if not start of codepoint
+let s = "x\u0302"
+    @test s[1:3] == s
+end
+
 # reverseind
 for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
     for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
@@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
             end
         end
     end
-end
\ No newline at end of file
+end
diff --git a/test/unicode.jl b/test/unicode.jl
index 4152800f331a0..72658ed349d41 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -93,9 +93,40 @@ else
 end
 
 # check utf8proc handling of CN category constants
-
 let c_ll = 'β', c_cn = '\u038B'
     @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
     # check codepoint with category code CN
     @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
 end
+
+# graphemes
+let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
+                                              "β","l","a","h",
+                                              "b\u0302","l","á","h"]),
+                ("", UTF8String[]),
+                ("x\u0302", ["x\u0302"]),
+                ("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
+                ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
+                                                "\U1d4c1\u0300"]),
+                ("x",["x"]),
+                ("abc",["a","b","c"]))
+    for T in (utf8,utf16,utf32)
+        for nf in (:NFC, :NFD)
+            for (s, g) in grphtest
+                s_ = T(normalize_string(s, nf))
+                g_ = map(s -> normalize_string(s, nf), g)
+                grph = collect(graphemes(s_))
+                # each grapheme must really be a SubString of the input,
+                # matching eltype and the documented behavior
+                for x in graphemes(s_)
+                    @test isa(x, SubString{typeof(s_)})
+                end
+                @test grph == g_
+                @test length(graphemes(s_)) == length(grph)
+            end
+            S = [T(normalize_string(s)) for (s,g) in grphtest]
+            G = map(graphemes, S)
+            @test map(graphemes, sort!(S)) == sort!(G)
+        end
+    end
+end