Add tokenizers bench.
fmassot committed Jul 6, 2023
1 parent 8f2c663 commit 234e47f
Showing 4 changed files with 176 additions and 2 deletions.
3 changes: 2 additions & 1 deletion quickwit/Cargo.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions quickwit/quickwit-query/Cargo.toml
@@ -27,6 +27,7 @@ thiserror = { workspace = true }
whichlang = { workspace = true, optional = true }

[dev-dependencies]
criterion = { workspace = true }
proptest = { workspace = true }
time = { workspace = true }

@@ -41,3 +42,7 @@ multilang = [
testsuite = [
"multilang",
]

[[bench]]
name = "tokenizers_bench"
harness = false
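
With harness = false, Criterion supplies the bench binary's main() through its macros instead of the default libtest harness. Assuming the standard Cargo workspace layout, the benchmark can then be run from the quickwit/ directory with:

cargo bench -p quickwit-query --bench tokenizers_bench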
168 changes: 168 additions & 0 deletions quickwit/quickwit-query/benches/tokenizers_bench.rs
@@ -0,0 +1,168 @@
// Copyright (C) 2023 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use quickwit_query::create_default_quickwit_tokenizer_manager;
use tantivy::tokenizer::TextAnalyzer;
use tantivy_tokenizer_api::Token;

// Short and long sample texts in several scripts, used as tokenizer inputs.
const ASCII_SHORT: &str = "It is a long established fact";
const ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a
page when looking at its layout. The point of using Lorem Ipsum is that it has a
more-or-less normal distribution of letters, as opposed to using 'Content here, content
here', making it look like readable English. Many desktop publishing packages and web page
editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will
uncover many web sites still in their infancy. Various versions have evolved over the years,
sometimes by accident, sometimes on purpose (injected humour and the like)."#;
const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね";
const JPN_LONG: &str = r#"日本ごです。 和名の由来は、
太陽の動きにつれてその方向を追うように花が回るといわれたことから。
ただしこの動きは生長に伴うものであるため、
実際に太陽を追って動くのは生長が盛んな若い時期だけである。
若いヒマワリの茎の上部の葉は太陽に正対になるように動き、
朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、
夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、
つぼみが大きくなり花が開く素敵な言葉ですね."#;
const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。";
const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。
白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。
是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事,
滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#;
const KOR_SHORT: &str = "안녕하세요. 반갑습니다.";
const KOR_LONG: &str = r#"
포근히 내려오는 눈밭속에서는
낯이 붉은 處女아이들도 깃들이어 오는 소리…
울고
웃고
수구리고
새파라니 얼어서
運命들이 모두다 안끼어 드는 소리…
큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적
큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리
끊임없이 내리는 눈발 속에서는
山도 山도 靑山도 안끼어 드는 소리
"#;

/// Collects every token the analyzer emits for `text` into a Vec.
fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec<Token> {
let mut token_stream = analyzer.token_stream(text);
let mut tokens: Vec<Token> = vec![];
token_stream.process(&mut |token: &Token| tokens.push(token.clone()));
tokens
}

pub fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("multilang");
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
let mut default_tokenizer = tokenizer_manager.get("default").unwrap();
let mut multilang_tokenizer = tokenizer_manager.get("multilang").unwrap();
let mut chinese_tokenizer = tokenizer_manager.get("chinese_compatible").unwrap();

// Throughput::Bytes makes Criterion report bytes/second alongside wall time.
// Note that str::len() counts bytes, not characters, so the CJK samples carry
// more bytes per character than the ASCII ones.
group
.throughput(Throughput::Bytes(ASCII_SHORT.len() as u64))
.bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| {
b.iter(|| process_tokens(&mut default_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(ASCII_LONG.len() as u64))
.bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| {
b.iter(|| process_tokens(&mut default_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(ASCII_SHORT.len() as u64))
.bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(ASCII_LONG.len() as u64))
.bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
// Prefixing the text with a language code such as "ENG:" lets the multilang
// tokenizer bypass automatic language detection; throughput is still measured
// on the unprefixed length.
let short_with_prefix = "ENG:".to_string() + ASCII_SHORT;
group
.throughput(Throughput::Bytes(ASCII_SHORT.len() as u64))
.bench_with_input(
"multilang-tokenize-short-with-prefix",
&short_with_prefix,
|b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
},
);
let long_with_prefix = "ENG:".to_string() + ASCII_LONG;
group
.throughput(Throughput::Bytes(ASCII_LONG.len() as u64))
.bench_with_input(
"multilang-tokenize-long-with-prefix",
&long_with_prefix,
|b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
},
);
group
.throughput(Throughput::Bytes(JPN_SHORT.len() as u64))
.bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(JPN_LONG.len() as u64))
.bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(CMN_SHORT.len() as u64))
.bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(CMN_LONG.len() as u64))
.bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(KOR_SHORT.len() as u64))
.bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(KOR_LONG.len() as u64))
.bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| {
b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text)));
});
group
.throughput(Throughput::Bytes(CMN_SHORT.len() as u64))
.bench_with_input(
"chinese-compatible-tokenize-cmn-short",
CMN_SHORT,
|b, text| {
b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text)));
},
);
group
.throughput(Throughput::Bytes(CMN_LONG.len() as u64))
.bench_with_input(
"chinese-compatible-tokenize-cmn-long",
CMN_LONG,
|b, text| {
b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text)));
},
);
}

// These macros generate the bench binary's main() entry point, which is why
// Cargo.toml sets harness = false for this target.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
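
Criterion also accepts a substring filter after --, which is handy for running a subset of this group; for example, the following should run only the Japanese benchmarks:

cargo bench -p quickwit-query --bench tokenizers_bench -- jpn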
@@ -249,7 +249,7 @@ mod tests {
},
];

- assert_eq!(dbg!(res), dbg!(expected));
+ assert_eq!(res, expected);
}

proptest::proptest! {
