Update Lindera to 0.32.2 and Tantivy to 0.22.0 (#84)
* update lindera to 0.32.2 and tantivy to 0.22.0

* update readme example
lolepop authored Nov 19, 2024
1 parent 2091a7b commit e276de1
Showing 8 changed files with 54 additions and 116 deletions.
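At a glance, the Lindera side of this migration is a single API move: dictionary loading goes from the free function `load_dictionary_from_config` to the associated function `DictionaryLoader::load_dictionary_from_config`. Here is a minimal sketch assembled from the calls that appear in the diffs below (the wrapper name `build_ipadic_tokenizer` is ours, and the `ipadic` feature is assumed to be enabled):

```rust
use lindera_core::mode::Mode;
use lindera_dictionary::{DictionaryConfig, DictionaryKind, DictionaryLoader};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// lindera 0.32: dictionary loading is an associated function on
// `DictionaryLoader` instead of a free function.
fn build_ipadic_tokenizer() -> LinderaTokenizer {
    let dictionary_config = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None,
    };
    // 0.27 equivalent: load_dictionary_from_config(dictionary_config)
    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
    LinderaTokenizer::new(dictionary, None, Mode::Normal)
}
```

The tokenizer constructor itself is unchanged; only the loading entry point moves.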
16 changes: 6 additions & 10 deletions Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-tantivy"
-version = "0.27.1"
+version = "0.32.2"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
@@ -17,21 +17,17 @@ ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
 unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
 ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
 cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
-ipadic-compress = ["lindera-tokenizer/ipadic-compress"]
-unidic-compress = ["lindera-tokenizer/unidic-compress"]
-ko-dic-compress = ["lindera-tokenizer/ko-dic-compress"]
-cc-cedict-compress = ["lindera-tokenizer/cc-cedict-compress"]

 [dependencies]
-tantivy-tokenizer-api = "0.2.0"
+tantivy-tokenizer-api = "0.3.0"

-lindera-core = "0.27.0"
-lindera-dictionary = "0.27.0"
-lindera-tokenizer = "0.27.0"
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tokenizer = "0.32.2"

 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
-tantivy = "0.21.1"
+tantivy = "0.22.0"

 [[bench]]
 name = "bench"
32 changes: 14 additions & 18 deletions README.md
@@ -12,27 +12,23 @@ The following example enables IPADIC.

 ```
 [dependencies]
-lindera-core = "0.24.0"
-lindera-dictionary = "0.24.0"
-lindera-tantivy = { version = "0.24.0", features = ["ipadic"] }
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
 ```

 ### Basic example

 ```rust
-use tantivy::{
-    collector::TopDocs,
-    doc,
-    query::QueryParser,
-    schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-    Index,
-};
-
-use lindera_core::mode::Mode;
-use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
-use lindera_tantivy::tokenizer::LinderaTokenizer;
-
 fn main() -> tantivy::Result<()> {
+    use tantivy::{
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+    };
+
+    use lindera_core::mode::Mode;
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
     // create schema builder
     let mut schema_builder = Schema::builder();

@@ -83,7 +79,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
@@ -134,8 +130,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())
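The retrieval change in this README hunk reflects tantivy 0.22 making `Searcher::doc` generic over the document type: the concrete `TantivyDocument` must now be named, and JSON serialization moves from `Schema::to_json(&doc)` to `doc.to_json(&schema)` on the `Document` trait (hence the extra `Document` and `TantivyDocument` imports). A small sketch of just that loop, assuming `searcher`, `query`, and `schema` are built as in the example above (the helper name `print_hits` is ours):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::Query;
use tantivy::schema::Schema;
use tantivy::{Document, Searcher, TantivyDocument};

// Print the top matches as JSON, tantivy 0.22 style.
fn print_hits(searcher: &Searcher, query: &dyn Query, schema: &Schema) -> tantivy::Result<()> {
    let top_docs = searcher.search(query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        // `doc` is now generic, so the target document type must be annotated.
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        // `to_json` lives on the `Document` trait and takes the schema as an argument.
        println!("{}", retrieved_doc.to_json(schema));
    }
    Ok(())
}
```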
4 changes: 2 additions & 2 deletions benches/bench.rs
@@ -8,7 +8,7 @@ fn bench_indexing(c: &mut Criterion) {
     use tantivy::Index;

     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;

     // create schema builder
@@ -58,7 +58,7 @@ fn bench_indexing(c: &mut Criterion) {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
14 changes: 5 additions & 9 deletions examples/cc-cedict.rs
@@ -1,15 +1,11 @@
#[cfg(feature = "cc-cedict")]
fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Index,
collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::CcCedict),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())
14 changes: 5 additions & 9 deletions examples/ipadic.rs
@@ -1,15 +1,11 @@
#[cfg(feature = "ipadic")]
fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Index,
collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())
14 changes: 5 additions & 9 deletions examples/ko-dic.rs
@@ -1,15 +1,11 @@
#[cfg(feature = "ko-dic")]
fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Index,
collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::KoDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())
14 changes: 5 additions & 9 deletions examples/unidic.rs
@@ -1,15 +1,11 @@
#[cfg(feature = "unidic")]
fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Index,
collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::UniDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }

     Ok(())
62 changes: 12 additions & 50 deletions src/tokenizer.rs
@@ -49,17 +49,16 @@ mod tests {
     use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};

     use super::LinderaTokenizer;

-    #[cfg(feature = "ipadic")]
-    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+    fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
         let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::IPADIC),
+            kind: Some(dictionary_kind),
             path: None,
         };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+        let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
         let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

         let mut token_stream = tokenizer.token_stream(text);
@@ -72,61 +71,24 @@ mod tests {
         tokens
     }

+    #[cfg(feature = "ipadic")]
+    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+        token_stream_helper(text, DictionaryKind::IPADIC)
+    }
+
     #[cfg(feature = "unidic")]
     fn token_stream_helper_unidic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::UniDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::UniDic)
     }

     #[cfg(feature = "ko-dic")]
     fn token_stream_helper_kodic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::KoDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::KoDic)
     }

     #[cfg(feature = "cc-cedict")]
     fn token_stream_helper_cccedict(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::CcCedict),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::CcCedict)
     }

/// This is a function that can be used in tests and doc tests
