fix(textprocessing): improve tokenizer
guo-yong-zhi committed Oct 11, 2024
1 parent 43ec72f commit fd44808
Showing 2 changed files with 18 additions and 5 deletions.
10 changes: 5 additions & 5 deletions src/textprocessing.jl
@@ -47,12 +47,12 @@ function lemmatize!(d::AbstractDict, lemmatizer)
 end
 
 function tokenizer(text::AbstractString, regexp=r"\w+")
-    [text[i] for i in findall(regexp, text)]
+    (m.match for m in eachmatch(regexp, text))
 end
 
-function tokenizer_eng(text::AbstractString, regexp=r"\b\w+(?:'\w+)*\b")
-    indices = findall(regexp, text)
-    [endswith(text[i], "'s") ? text[i][1:prevind(text[i], end, 2)] : text[i] for i in indices]
+function tokenizer_eng(text::AbstractString, regexp=r"\b[\w']+\b")
+    ms = eachmatch(regexp, text)
+    (endswith(m.match, "'s") ? m.match[1:prevind(m.match, end, 2)] : m.match for m in ms)
 end
 
 # ISO 639-3 macrolanguages
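Both tokenizers now return lazy generators built on eachmatch instead of eagerly allocated vectors, and tokenizer_eng strips a trailing possessive "'s" with prevind, so the cut stays valid even when the token contains multi-byte characters. A quick sketch of the new behavior (not part of this commit); callers that need a vector should collect the generator:

    using WordCloud

    tokenize_eng = WordCloud.TextProcessing.tokenizer_eng
    tokens = tokenize_eng("bob's 'book'")  # a lazy Base.Generator, not a Vector
    collect(tokens)                        # ["bob", "book"]: possessive and edge quotes dropped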
@@ -125,9 +125,9 @@ function countwords(text::AbstractString; language=:auto, kargs...)
     tokenizer_ = get(TOKENIZERS, language, TOKENIZERS["_default_"])
     countwords(tokenizer_(text); language=language, kargs...)
 end
-countwords(words::AbstractVector{<:AbstractString}; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 countwords(counter::AbstractDict{<:AbstractString,<:Real}; kargs...) = countwords(keys(counter), values(counter); kargs...)
 countwords(wordscounts::Tuple; kargs...) = countwords(wordscounts...; kargs...)
+countwords(words; kargs...) = countwords(words, Iterators.repeated(1); kargs...)
 function countwords(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
     countwords(first.(counter), [v[2] for v in counter]; kargs...)
 end
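The AbstractVector{<:AbstractString} restriction is dropped so this fallback also accepts the lazy generators the tokenizers now produce. A minimal sketch of the intended composition, assuming the two-argument core method pairs each word with its count as the Iterators.repeated(1) call implies (not part of this commit):

    using WordCloud
    const TP = WordCloud.TextProcessing

    words = TP.tokenizer("one fish two fish")  # lazy generator; no collect needed anymore
    TP.countwords(words)                       # every occurrence is counted with weight 1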
13 changes: 13 additions & 0 deletions test/test_textprocessing.jl
@@ -9,6 +9,19 @@
 words, weights = WordCloud.TextProcessing.processtext(c)
 @test !("to" in words) # stopwords
 
+tokenizer_eng = WordCloud.TextProcessing.tokenizer_eng
+tokenizer_default = WordCloud.TextProcessing.tokenizer
+@test tokenizer_default(" a man の 书本\n 1234") .|> strip == ["a", "man", "の", "书本", "1234"]
+@test tokenizer_eng(" a book in 1994\n") .|> strip == ["a", "book", "in", "1994"]
+@test tokenizer_eng(" the 'best-book' in 1994\n") .|> strip == ["the", "best", "book", "in", "1994"]
+@test tokenizer_eng("")|>collect == tokenizer_eng(" ")|>collect == tokenizer_eng(" ,")|>collect == []
+@test tokenizer_eng(" a _int_var3") .|> strip == ["a", "_int_var3"]
+@test tokenizer_eng("bob's book") .|> strip == ["bob", "book"]
+@test tokenizer_eng("bob's 'book' 'book'") .|> strip == ["bob", "book", "book"]
+@test tokenizer_eng("abc'de fg'h'ij k'l") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+@test tokenizer_eng("abc'de', fg'h'ij' k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+@test tokenizer_eng(" abc'de'. fg'h'ij',k'l'") .|> strip == ["abc'de", "fg'h'ij", "k'l"]
+
 lemmatizer_eng = WordCloud.TextProcessing.lemmatizer_eng
 lemmatize! = WordCloud.TextProcessing.lemmatize!
 @test lemmatizer_eng("Cars") == "Car"
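The new edge-case tests pin down a subtlety of r"\b[\w']+\b": \b matches only at a word/non-word transition, so a match can neither start nor end on a bare apostrophe. Interior apostrophes, as in contractions like "fg'h'ij", survive, while leading and trailing quotes are trimmed. A small illustration, not part of this commit:

    # Edge quotes are excluded because \b cannot match between two
    # non-word characters (quote/space, quote/comma), so every match
    # must start and end on a word character.
    matches = [m.match for m in eachmatch(r"\b[\w']+\b", "abc'de', 'k'l'")]
    # matches == ["abc'de", "k'l"]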
