fix(textprocessing): improve tokenizer
guo-yong-zhi committed Oct 11, 2024
1 parent 43ec72f commit fd44808
Showing 2 changed files with 18 additions and 5 deletions.
10 changes: 5 additions & 5 deletions src/textprocessing.jl
@@ -47,12 +47,12 @@ function lemmatize!(d::AbstractDict, lemmatizer)
 end
 
 function tokenizer(text::AbstractString, regexp=r"\w+")
-    [text[i] for i in findall(regexp, text)]
+    (m.match for m in eachmatch(regexp, text))
 end
 
-function tokenizer_eng(text::AbstractString, regexp=r"\b\w+(?:'\w+)*\b")
-    indices = findall(regexp, text)
-    [endswith(text[i], "'s") ? text[i][1:prevind(text[i], end, 2)] : text[i] for i in indices]
+function tokenizer_eng(text::AbstractString, regexp=r"\b[\w']+\b")
+    ms = eachmatch(regexp, text)
+    (endswith(m.match, "'s") ? m.match[1:prevind(m.match, end, 2)] : m.match for m in ms)
 end
 
 # ISO 639-3 macrolanguages
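Both tokenizers now return lazy generators built on eachmatch instead of eagerly allocated vectors, and tokenizer_eng strips a trailing possessive "'s" with prevind, so the cut stays valid even when the token contains multi-byte characters. A quick sketch of the new behavior (not part of this commit); callers that need a vector should collect the generator:

    using WordCloud

    tokenize_eng = WordCloud.TextProcessing.tokenizer_eng
    tokens = tokenize_eng("bob's 'book'")  # a lazy Base.Generator, not a Vector
    collect(tokens)                        # ["bob", "book"]: possessive and edge quotes dropped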
@@ -125,9 +125,9 @@ function countwords(text::AbstractString; language=:auto, kargs...)
     tokenizer_ = get(TOKENIZERS, language, TOKENIZERS["_default_"])
     countwords(tokenizer_(text); language=language, kargs...)
 end
-countwords(words::AbstractVector{<:AbstractString}; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 countwords(counter::AbstractDict{<:AbstractString,<:Real}; kargs...) = countwords(keys(counter), values(counter); kargs...)
 countwords(wordscounts::Tuple; kargs...) = countwords(wordscounts...; kargs...)
+countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
     countwords(first.(counter), [v[2] for v in counter]; kargs...)
 end
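The AbstractVector{<:AbstractString} restriction is dropped so this fallback also accepts the lazy generators the tokenizers now produce. A minimal sketch of the intended composition, assuming the two-argument core method pairs each word with its count as the Iterators.repeated(1) call implies (not part of this commit):

    using WordCloud
    const TP = WordCloud.TextProcessing

    words = TP.tokenizer("one fish two fish")  # lazy generator; no collect needed anymore
    TP.countwords(words)                       # every occurrence is counted with weight 1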
13 changes: 13 additions & 0 deletions test/test_textprocessing.jl
@@ -9,6 +9,19 @@
 words, weights = WordCloud.TextProcessing.processtext(c)
 @test !("to" in words) # stopwords
 
+tokenizer_eng = WordCloud.TextProcessing.tokenizer_eng
+tokenizer_default = WordCloud.TextProcessing.tokenizer
+@test tokenizer_default(" a man の 书本\n 1234") .|> strip == ["a", "man", "の", "书本", "1234"]
+@test tokenizer_eng(" a book in 1994\n") .|> strip == ["a", "book", "in", "1994"]
+@test tokenizer_eng(" the 'best-book' in 1994\n") .|> strip == ["the", "best", "book", "in", "1994"]
+@test tokenizer_eng("")|>collect == tokenizer_eng(" ")|>collect == tokenizer_eng(" ,")|>collect == []
+@test tokenizer_eng(" a _int_var3") .|> strip == ["a", "_int_var3"]
+@test tokenizer_eng("bob's book") .|> strip == ["bob", "book"]
+@test tokenizer_eng("bob's 'book' 'book'") .|> strip == ["bob", "book", "book"]
+@test tokenizer_eng("abc'de fg'h'ij k'l") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+@test tokenizer_eng("abc'de', fg'h'ij' k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+@test tokenizer_eng(" abc'de'. fg'h'ij',k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+
 lemmatizer_eng = WordCloud.TextProcessing.lemmatizer_eng
 lemmatize! = WordCloud.TextProcessing.lemmatize!
 @test lemmatizer_eng("Cars") == "Car"
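The new edge-case tests pin down a subtlety of r"\b[\w']+\b": \b matches only at a word/non-word transition, so a match can neither start nor end on a bare apostrophe. Interior apostrophes, as in contractions like "fg'h'ij", survive, while leading and trailing quotes are trimmed. A small illustration, not part of this commit:

    # Edge quotes are excluded because \b cannot match between two
    # non-word characters (quote/space, quote/comma), so every match
    # must start and end on a word character.
    matches = [m.match for m in eachmatch(r"\b[\w']+\b", "abc'de', 'k'l'")]
    # matches == ["abc'de", "k'l"]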
