From e276de1a405c95a8efd61d89ca4568fedfa80086 Mon Sep 17 00:00:00 2001
From: Lolepopie <8401103+lolepop@users.noreply.github.com>
Date: Tue, 19 Nov 2024 16:13:04 +0800
Subject: [PATCH] Update Lindera to 0.32.2 and Tantivy to 0.22.0 (#84)

* update lindera to 0.32.2 and tantivy to 0.22.0

* update readme example
---
 Cargo.toml            | 16 +++------
 README.md             | 32 ++++++++++------------
 benches/bench.rs      |  4 +--
 examples/cc-cedict.rs | 14 ++++------
 examples/ipadic.rs    | 14 ++++------
 examples/ko-dic.rs    | 14 ++++------
 examples/unidic.rs    | 14 ++++------
 src/tokenizer.rs      | 62 +++++++++----------------------------------
 8 files changed, 54 insertions(+), 116 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 49ebedc..f72d289 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-tantivy"
-version = "0.27.1"
+version = "0.32.2"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
@@ -17,21 +17,17 @@ ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
 unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
 ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
 cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
-ipadic-compress = ["lindera-tokenizer/ipadic-compress"]
-unidic-compress = ["lindera-tokenizer/unidic-compress"]
-ko-dic-compress = ["lindera-tokenizer/ko-dic-compress"]
-cc-cedict-compress = ["lindera-tokenizer/cc-cedict-compress"]
 
 [dependencies]
-tantivy-tokenizer-api = "0.2.0"
+tantivy-tokenizer-api = "0.3.0"
 
-lindera-core = "0.27.0"
-lindera-dictionary = "0.27.0"
-lindera-tokenizer = "0.27.0"
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tokenizer = "0.32.2"
 
 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
-tantivy = "0.21.1"
+tantivy = "0.22.0"
 
 [[bench]]
 name = "bench"
diff --git a/README.md b/README.md
index c7f5362..40e8da6 100644
--- a/README.md
+++ b/README.md
@@ -12,27 +12,23 @@ The following example enables IPADIC.
 
 ```
 [dependencies]
-lindera-core = "0.24.0"
-lindera-dictionary = "0.24.0"
-lindera-tantivy = { version = "0.24.0", features = ["ipadic"] }
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
 ```
 
 ### Basic example
 
 ```rust
-use tantivy::{
-    collector::TopDocs,
-    doc,
-    query::QueryParser,
-    schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-    Index,
-};
-
-use lindera_core::mode::Mode;
-use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
-use lindera_tantivy::tokenizer::LinderaTokenizer;
-
 fn main() -> tantivy::Result<()> {
+    use tantivy::{
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+    };
+
+    use lindera_core::mode::Mode;
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
     // create schema builder
     let mut schema_builder = Schema::builder();
 
@@ -83,7 +79,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -134,8 +130,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/benches/bench.rs b/benches/bench.rs
index 653a19e..0b1cc26 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -8,7 +8,7 @@ fn bench_indexing(c: &mut Criterion) {
     use tantivy::Index;
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -58,7 +58,7 @@ fn bench_indexing(c: &mut Criterion) {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
diff --git a/examples/cc-cedict.rs b/examples/cc-cedict.rs
index 93e4bcd..8ebc94e 100644
--- a/examples/cc-cedict.rs
+++ b/examples/cc-cedict.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "cc-cedict")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::CcCedict),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/ipadic.rs b/examples/ipadic.rs
index 5717209..332388d 100644
--- a/examples/ipadic.rs
+++ b/examples/ipadic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/ko-dic.rs b/examples/ko-dic.rs
index 7a009e7..9ab5c09 100644
--- a/examples/ko-dic.rs
+++ b/examples/ko-dic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "ko-dic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::KoDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/unidic.rs b/examples/unidic.rs
index ef8e125..a1ce587 100644
--- a/examples/unidic.rs
+++ b/examples/unidic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "unidic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::UniDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d8bd608..cb81864 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -49,17 +49,16 @@ mod tests {
     use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
 
     use super::LinderaTokenizer;
 
-    #[cfg(feature = "ipadic")]
-    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+    fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
         let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::IPADIC),
+            kind: Some(dictionary_kind),
             path: None,
         };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+        let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
         let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
         let mut token_stream = tokenizer.token_stream(text);
@@ -72,61 +71,24 @@ mod tests {
         tokens
     }
 
+    #[cfg(feature = "ipadic")]
+    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+        token_stream_helper(text, DictionaryKind::IPADIC)
+    }
+
     #[cfg(feature = "unidic")]
     fn token_stream_helper_unidic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::UniDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::UniDic)
     }
 
     #[cfg(feature = "ko-dic")]
     fn token_stream_helper_kodic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::KoDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::KoDic)
     }
 
     #[cfg(feature = "cc-cedict")]
     fn token_stream_helper_cccedict(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::CcCedict),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::CcCedict)
     }
 
     /// This is a function that can be used in tests and doc tests
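
Migration notes (not part of the patch): the two changes this diff repeats across every file are (1) lindera-dictionary 0.32 moving the free function `load_dictionary_from_config` onto the `DictionaryLoader` type, and (2) tantivy 0.22 turning `Document` into a trait, so a retrieved document needs a concrete type annotation (`TantivyDocument`) and JSON serialization moves from `schema.to_json(&doc)` to `doc.to_json(&schema)`. A minimal sketch of the resulting tokenizer setup, assuming the `ipadic` feature is enabled; the helper name `build_ipadic_tokenizer` is illustrative and does not appear in the patch:

```rust
use lindera_core::mode::Mode;
use lindera_dictionary::{DictionaryConfig, DictionaryKind, DictionaryLoader};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// Hypothetical helper (not from the patch): builds the tokenizer the way
// every example in this diff now does it.
fn build_ipadic_tokenizer() -> LinderaTokenizer {
    let dictionary_config = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None,
    };
    // 0.27: load_dictionary_from_config(dictionary_config)
    // 0.32: the loader is an associated function on DictionaryLoader.
    let dictionary =
        DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
    LinderaTokenizer::new(dictionary, None, Mode::Normal)
}
```

On the retrieval side, the pattern changes from `let retrieved_doc = searcher.doc(doc_address)?` with `schema.to_json(&retrieved_doc)` to `let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?` with `retrieved_doc.to_json(&schema)`, importing `Document` and `TantivyDocument` from the `tantivy` crate root as the examples above do.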