Add language prefix for multilanguage tokenizer, rename the tokenizer.
fmassot committed Apr 13, 2023
1 parent 85f2876 commit 5405a08
Showing 6 changed files with 199 additions and 61 deletions.
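In short: the tokenizer previously registered as `multi_language` is now registered as `multilanguage`, and a text can force its language with a 3-letter prefix (`ENG:`, `JPN:`, `CMN:`) instead of relying on whichlang detection. A minimal sketch of the new behavior, not part of this diff (it mirrors the benchmark code below and assumes the `quickwit_doc_mapper` re-export of `QUICKWIT_TOKENIZER_MANAGER`):

    use quickwit_doc_mapper::QUICKWIT_TOKENIZER_MANAGER;
    use tantivy::tokenizer::TokenStream;

    fn main() {
        // "multilanguage" is the new registry name (was "multi_language").
        let analyzer = QUICKWIT_TOKENIZER_MANAGER.get("multilanguage").unwrap();
        // The "JPN:" prefix forces the Lindera Japanese tokenizer and is
        // stripped from the text; without a prefix, whichlang detects the
        // language from the text itself.
        let mut stream = analyzer.token_stream("JPN:すもももももももものうち");
        while stream.advance() {
            println!("{:?}", stream.token());
        }
    }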
92 changes: 67 additions & 25 deletions quickwit/quickwit-doc-mapper/benches/tokenizers_bench.rs
@@ -18,24 +18,50 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
- use quickwit_doc_mapper::{QUICKWIT_TOKENIZER_MANAGER};
- use tantivy::tokenizer::{Token, TextAnalyzer};
+ use quickwit_doc_mapper::QUICKWIT_TOKENIZER_MANAGER;
+ use tantivy::tokenizer::{TextAnalyzer, Token};

// A random ascii string of length 100 chars.
const ASCII_SHORT: &str = "It is a long established fact";
const ASCII_MEDIUM: &str = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).";
const ASCII_MEDIUM: &str =
"It is a long established fact that a reader will be distracted by the readable content of a \
page when looking at its layout. The point of using Lorem Ipsum is that it has a \
more-or-less normal distribution of letters, as opposed to using 'Content here, content \
here', making it look like readable English. Many desktop publishing packages and web page \
editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will \
uncover many web sites still in their infancy. Various versions have evolved over the years, \
sometimes by accident, sometimes on purpose (injected humour and the like).";
const ASCII_WITH_LANG_PREFIX_SHORT: &str = "ENG:it is a long established fact";
const ASCII_WITH_LANG_PREFIX_MEDIUM: &str =
"ENG:It is a long established fact that a reader will be distracted by the readable content \
of a page when looking at its layout. The point of using Lorem Ipsum is that it has a \
more-or-less normal distribution of letters, as opposed to using 'Content here, content \
here', making it look like readable English. Many desktop publishing packages and web page \
editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will \
uncover many web sites still in their infancy. Various versions have evolved over the years, \
sometimes by accident, sometimes on purpose (injected humour and the like).";
const JP_SHORT: &str = "日本ごです。 とても素敵な言葉ですね";
const JP_MEDIUM: &str = "日本ごです。 和名の由来は、太陽の動きにつれてその方向を追うように花が回るといわれたことから。ただしこの動きは生長に伴うものであるため、実際に太陽を追って動くのは生長が盛んな若い時期だけである。若いヒマワリの茎の上部の葉は太陽に正対になるように動き、朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、つぼみが大きくなり花が開く素敵な言葉ですね.";
const JP_MEDIUM: &str = "日本ごです。 和名の由来は、\
太陽の動きにつれてその方向を追うように花が回るといわれたことから。\
ただしこの動きは生長に伴うものであるため、\
実際に太陽を追って動くのは生長が盛んな若い時期だけである。\
若いヒマワリの茎の上部の葉は太陽に正対になるように動き、\
朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、\
夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、\
つぼみが大きくなり花が開く素敵な言葉ですね.";
const CN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。";
const CN_MEDIUM: &str = "滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事,滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。";

const CN_MEDIUM: &str = "滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。\
白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。\
是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事,\
滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。";

pub fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("multilanguage");
let default_tokenizer = QUICKWIT_TOKENIZER_MANAGER.get("default").unwrap();
-     let multilanguage_tokenizer = QUICKWIT_TOKENIZER_MANAGER.get("multi_language").unwrap();
-     let chinese_tokenizer = QUICKWIT_TOKENIZER_MANAGER.get("chinese_compatible").unwrap();
-     let japanese_tokenizer = QUICKWIT_TOKENIZER_MANAGER.get("japanese").unwrap();
+     let multilanguage_tokenizer = QUICKWIT_TOKENIZER_MANAGER.get("multilanguage").unwrap();
+     let chinese_tokenizer = QUICKWIT_TOKENIZER_MANAGER
+         .get("chinese_compatible")
+         .unwrap();
fn process_tokens(analyzer: &TextAnalyzer, text: &str) -> Vec<Token> {
let mut token_stream = analyzer.token_stream(text);
let mut tokens: Vec<Token> = vec![];
@@ -62,6 +88,24 @@ pub fn criterion_benchmark(c: &mut Criterion) {
.bench_with_input("multilanguage-tokenize-long", ASCII_MEDIUM, |b, text| {
b.iter(|| process_tokens(&multilanguage_tokenizer, black_box(text)));
});
+     group
+         .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64))
+         .bench_with_input(
+             "multilanguage-prefix-lang-tokenize-short",
+             ASCII_WITH_LANG_PREFIX_SHORT,
+             |b, text| {
+                 b.iter(|| process_tokens(&multilanguage_tokenizer, black_box(text)));
+             },
+         );
+     group
+         .throughput(Throughput::Bytes(ASCII_MEDIUM.len() as u64))
+         .bench_with_input(
+             "multilanguage-prefix-lang-detection-tokenize-long",
+             ASCII_WITH_LANG_PREFIX_MEDIUM,
+             |b, text| {
+                 b.iter(|| process_tokens(&multilanguage_tokenizer, black_box(text)));
+             },
+         );
group
.throughput(Throughput::Bytes(JP_SHORT.len() as u64))
.bench_with_input("multilanguage-tokenize-jpn-short", JP_SHORT, |b, text| {
@@ -84,24 +128,22 @@ pub fn criterion_benchmark(c: &mut Criterion) {
});
group
.throughput(Throughput::Bytes(CN_SHORT.len() as u64))
.bench_with_input("chinese-compatible-tokenize-cmn-short", CN_SHORT, |b, text| {
b.iter(|| process_tokens(&chinese_tokenizer, black_box(text)));
});
.bench_with_input(
"chinese-compatible-tokenize-cmn-short",
CN_SHORT,
|b, text| {
b.iter(|| process_tokens(&chinese_tokenizer, black_box(text)));
},
);
group
.throughput(Throughput::Bytes(CN_MEDIUM.len() as u64))
.bench_with_input("chinese-compatible-tokenize-cmn-medium", CN_MEDIUM, |b, text| {
b.iter(|| process_tokens(&chinese_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(JP_SHORT.len() as u64))
.bench_with_input("japanese-tokenize-cmn-short", JP_SHORT, |b, text| {
b.iter(|| process_tokens(&japanese_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(JP_MEDIUM.len() as u64))
.bench_with_input("japanese-tokenize-cmn-medium", JP_MEDIUM, |b, text| {
b.iter(|| process_tokens(&japanese_tokenizer, black_box(text)));
});
.bench_with_input(
"chinese-compatible-tokenize-cmn-medium",
CN_MEDIUM,
|b, text| {
b.iter(|| process_tokens(&chinese_tokenizer, black_box(text)));
},
);
}

criterion_group!(benches, criterion_benchmark);
@@ -138,8 +138,8 @@ pub enum QuickwitTextTokenizer {
StemEn,
#[serde(rename = "chinese_compatible")]
Chinese,
#[serde(rename = "multi_language")]
MultiLanguage,
#[serde(rename = "multilanguage")]
Multilanguage,
}

impl QuickwitTextTokenizer {
@@ -149,7 +149,7 @@
QuickwitTextTokenizer::Default => "default",
QuickwitTextTokenizer::StemEn => "en_stem",
QuickwitTextTokenizer::Chinese => "chinese_compatible",
-             QuickwitTextTokenizer::MultiLanguage => "multi_language",
+             QuickwitTextTokenizer::Multilanguage => "multilanguage",
}
}
}
@@ -637,7 +637,7 @@ mod tests {
assert_eq!(
mapping_entry.unwrap_err().to_string(),
"Error while parsing field `my_field_name`: unknown variant `notexist`, expected one \
-              of `raw`, `default`, `en_stem`, `chinese_compatible`, `multi_language`"
+              of `raw`, `default`, `en_stem`, `chinese_compatible`, `multilanguage`"
.to_string()
);
Ok(())
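The rename is user-visible: a text field mapping must now specify `multilanguage` where it previously said `multi_language`, and the old spelling is rejected as an unknown variant (see the error-message test above). A self-contained sketch of that serde behavior, with a stand-in enum rather than the real `QuickwitTextTokenizer`:

    use serde::Deserialize;

    // Stand-in for QuickwitTextTokenizer, reduced to the renamed variant.
    #[derive(Deserialize, Debug, PartialEq)]
    enum Tokenizer {
        #[serde(rename = "multilanguage")]
        Multilanguage,
    }

    fn main() {
        // The new spelling parses...
        let t: Tokenizer = serde_json::from_str(r#""multilanguage""#).unwrap();
        assert_eq!(t, Tokenizer::Multilanguage);
        // ...while the old one now fails with an unknown-variant error.
        assert!(serde_json::from_str::<Tokenizer>(r#""multi_language""#).is_err());
    }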
36 changes: 36 additions & 0 deletions quickwit/quickwit-doc-mapper/src/doc_mapper.rs
@@ -200,6 +200,7 @@ mod tests {

use crate::default_doc_mapper::{
FastFieldOptions, FieldMappingType, QuickwitJsonOptions, QuickwitTextOptions,
+         QuickwitTextTokenizer,
};
use crate::{
Cardinality, DefaultDocMapperBuilder, DocMapper, DocParsingError, FieldMappingEntry,
@@ -516,4 +517,39 @@
wi_cloned.merge(wi_2);
assert_eq!(wi_cloned, wi_base);
}

+     #[test]
+     fn test_doc_mapper_query_with_multilanguage_field() {
+         let mut doc_mapper_builder = DefaultDocMapperBuilder::default();
+         doc_mapper_builder.field_mappings.push(FieldMappingEntry {
+             name: "multilang".to_string(),
+             mapping_type: FieldMappingType::Text(
+                 QuickwitTextOptions {
+                     tokenizer: Some(QuickwitTextTokenizer::Multilanguage),
+                     ..Default::default()
+                 },
+                 Cardinality::SingleValue,
+             ),
+         });
+         let doc_mapper = doc_mapper_builder.try_build().unwrap();
+         let schema = doc_mapper.schema();
+         let search_request = SearchRequest {
+             index_id: "quickwit-index".to_string(),
+             query: "multilang:\"JPN:す\"".to_string(),
+             search_fields: vec![],
+             snippet_fields: vec![],
+             start_timestamp: None,
+             end_timestamp: None,
+             max_hits: 10,
+             start_offset: 0,
+             sort_order: None,
+             sort_by_field: None,
+             aggregation_request: None,
+         };
+         let (query, _) = doc_mapper.query(schema, &search_request).unwrap();
+         assert_eq!(
+             format!("{query:?}"),
+             r#"TermQuery(Term(type=Str, field=0, "す"))"#
+         );
+     }
}
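The test above shows the end-to-end effect on queries: the quoted `"JPN:す"` reaches the multilanguage tokenizer whole, the prefix is consumed at tokenization time, and the resulting term contains only `す`. A hedged illustration of that final term, built with the tantivy `Term` API directly (not part of this diff; the field name is illustrative):

    use tantivy::schema::{Schema, TEXT};
    use tantivy::Term;

    fn main() {
        let mut builder = Schema::builder();
        let field = builder.add_text_field("multilang", TEXT);
        let _schema = builder.build();
        // The "JPN:" prefix never reaches the index or the query: the term
        // produced by the tokenizer holds only the stripped text.
        let term = Term::from_field_text(field, "す");
        println!("{term:?}");
    }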
3 changes: 1 addition & 2 deletions quickwit/quickwit-doc-mapper/src/lib.rs
@@ -44,8 +44,7 @@ use default_doc_mapper::{
};
pub use doc_mapper::{DocMapper, NamedField, WarmupInfo};
pub use error::{DocParsingError, QueryParserError};
- pub use tokenizers::{QUICKWIT_TOKENIZER_MANAGER};
- pub use multilanguage_tokenizer::MultiLanguageTokenStream;
+ pub use tokenizers::QUICKWIT_TOKENIZER_MANAGER;

/// Field name reserved for storing the source document.
pub const SOURCE_FIELD_NAME: &str = "_source";
105 changes: 88 additions & 17 deletions quickwit/quickwit-doc-mapper/src/multilanguage_tokenizer.rs
@@ -21,8 +21,9 @@ use lindera_tantivy::dictionary::load_dictionary;
use lindera_tantivy::stream::LinderaTokenStream;
use lindera_tantivy::tokenizer::LinderaTokenizer;
use lindera_tantivy::{DictionaryConfig, DictionaryKind, Mode};
+ use nom::InputTake;
use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer};
- use whichlang::detect_language;
+ use whichlang::{detect_language, Lang};

#[derive(Clone)]
pub(crate) struct MultiLanguageTokenizer {
Expand Down Expand Up @@ -57,7 +58,7 @@ impl MultiLanguageTokenizer {
}
}

- pub enum MultiLanguageTokenStream<'a> {
+ pub(crate) enum MultiLanguageTokenStream<'a> {
Lindera(LinderaTokenStream),
Simple(SimpleTokenStream<'a>),
}
Expand Down Expand Up @@ -85,19 +86,41 @@ impl<'a> TokenStream for MultiLanguageTokenStream<'a> {
}
}

+ /// If a language prefix is present, returns the corresponding language and the text without the
+ /// prefix. If no prefix is present, returns (None, text).
+ /// The language prefix is defined as `{ID}:text`, with ID being the 3-letter language code used
+ /// by whichlang.
+ fn process_language_prefix(text: &str) -> (Option<Lang>, &str) {
+     let prefix_bytes = text.as_bytes().take(std::cmp::min(4, text.len()));
+     let predefined_language = match prefix_bytes {
+         b"JPN:" => Some(Lang::Jpn),
+         b"CMN:" => Some(Lang::Cmn),
+         b"ENG:" => Some(Lang::Eng),
+         _ => None,
+     };
+     let text_to_tokenize = if predefined_language.is_some() {
+         &text[4..]
+     } else {
+         text
+     };
+     (predefined_language, text_to_tokenize)
+ }

impl Tokenizer for MultiLanguageTokenizer {
type TokenStream<'a> = MultiLanguageTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> MultiLanguageTokenStream<'a> {
-         // TODO: let the user defined the language with a prefix like `jpn:こんにちは`
-         let language = detect_language(text);
+         let (predefined_language, text_to_tokenize) = process_language_prefix(text);
+         let language = predefined_language.unwrap_or_else(|| detect_language(text_to_tokenize));
match language {
-             whichlang::Lang::Cmn => {
-                 MultiLanguageTokenStream::Lindera(self.cmn_tokenizer.token_stream(text))
+             Lang::Cmn => {
+                 MultiLanguageTokenStream::Lindera(self.cmn_tokenizer.token_stream(text_to_tokenize))
              }
-             whichlang::Lang::Jpn => {
-                 MultiLanguageTokenStream::Lindera(self.jpn_tokenizer.token_stream(text))
+             Lang::Jpn => {
+                 MultiLanguageTokenStream::Lindera(self.jpn_tokenizer.token_stream(text_to_tokenize))
              }
-             _ => MultiLanguageTokenStream::Simple(self.default_tokenizer.token_stream(text)),
+             _ => MultiLanguageTokenStream::Simple(
+                 self.default_tokenizer.token_stream(text_to_tokenize),
+             ),
}
}
}
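The prefix is intended for callers that already know the language of a piece of text, for example an ingestion pipeline with language metadata. A hypothetical helper, not part of this diff, showing how such a pipeline might attach the hint (the two-letter input codes are an assumption for illustration):

    /// Prepend a whichlang-style prefix when the language is known upstream, so
    /// the multilanguage tokenizer skips detection; otherwise pass the text through.
    fn with_lang_hint(lang: Option<&str>, text: &str) -> String {
        match lang {
            Some("en") => format!("ENG:{text}"),
            Some("ja") => format!("JPN:{text}"),
            Some("zh") => format!("CMN:{text}"),
            _ => text.to_string(), // unknown language: let detection decide
        }
    }

    fn main() {
        assert_eq!(with_lang_hint(Some("ja"), "こんにちは"), "JPN:こんにちは");
        assert_eq!(with_lang_hint(None, "hello"), "hello");
    }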
@@ -107,6 +130,7 @@ mod tests {
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

use super::{MultiLanguageTokenStream, MultiLanguageTokenizer};
+     use crate::multilanguage_tokenizer::process_language_prefix;

fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
@@ -117,15 +141,25 @@
#[test]
fn test_multilanguage_tokenizer_jpn() {
let tokenizer = MultiLanguageTokenizer::new();
let tokens = test_helper(tokenizer.token_stream("すもももももももものうち"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
assert_eq!(token.text, "すもも");
assert_eq!(token.offset_from, 0);
assert_eq!(token.offset_to, 9);
assert_eq!(token.position, 0);
assert_eq!(token.position_length, 1);
let tokens = test_helper(tokenizer.token_stream("すもももももももものうち"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
assert_eq!(token.text, "すもも");
assert_eq!(token.offset_from, 0);
assert_eq!(token.offset_to, 9);
assert_eq!(token.position, 0);
assert_eq!(token.position_length, 1);
}
}
{
let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち"));
assert_eq!(tokens.len(), 1);
}
{
let tokens = test_helper(tokenizer.token_stream("CMN:すもももももももものうち"));
assert_eq!(tokens.len(), 1);
}
}

@@ -145,4 +179,41 @@
assert_eq!(token.position_length, 1);
}
}

+     #[test]
+     fn test_multilanguage_tokenizer_with_predefined_language() {
+         {
+             // Force usage of the JPN tokenizer. This tokenizer will not ignore the dash,
+             // whereas the default tokenizer will.
+             let tokenizer = MultiLanguageTokenizer::new();
+             let tokens = test_helper(tokenizer.token_stream("JPN:-"));
+             assert_eq!(tokens.len(), 1);
+             let tokens = test_helper(tokenizer.token_stream("-"));
+             assert_eq!(tokens.len(), 0);
+         }
+     }
+
+     #[test]
+     fn test_multilanguage_process_predefined_language() {
+         {
+             let (lang, text) = process_language_prefix("JPN:すもももももももものうち");
+             assert_eq!(lang, Some(whichlang::Lang::Jpn));
+             assert_eq!(text, "すもももももももものうち");
+         }
+         {
+             let (lang, text) = process_language_prefix("CMN:地址1,包含無效的字元");
+             assert_eq!(lang, Some(whichlang::Lang::Cmn));
+             assert_eq!(text, "地址1,包含無效的字元");
+         }
+         {
+             let (lang, text) = process_language_prefix("ENG:my address");
+             assert_eq!(lang, Some(whichlang::Lang::Eng));
+             assert_eq!(text, "my address");
+         }
+         {
+             let (lang, text) = process_language_prefix("UNK:my address");
+             assert!(lang.is_none());
+             assert_eq!(text, "UNK:my address");
+         }
+     }
}
