From e276de1a405c95a8efd61d89ca4568fedfa80086 Mon Sep 17 00:00:00 2001
From: Lolepopie <8401103+lolepop@users.noreply.github.com>
Date: Tue, 19 Nov 2024 16:13:04 +0800
Subject: [PATCH] Update Lindera to 0.32.2 and Tantivy to 0.22.0 (#84)

* update lindera to 0.32.2 and tantivy to 0.22.0

* update readme example
---
 Cargo.toml            | 16 +++------
 README.md             | 32 ++++++++++------------
 benches/bench.rs      |  4 +--
 examples/cc-cedict.rs | 14 ++++------
 examples/ipadic.rs    | 14 ++++------
 examples/ko-dic.rs    | 14 ++++------
 examples/unidic.rs    | 14 ++++------
 src/tokenizer.rs      | 62 +++++++++----------------------------------
 8 files changed, 54 insertions(+), 116 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 49ebedc..f72d289 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-tantivy"
-version = "0.27.1"
+version = "0.32.2"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
@@ -17,21 +17,17 @@ ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
 unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
 ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
 cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
-ipadic-compress = ["lindera-tokenizer/ipadic-compress"]
-unidic-compress = ["lindera-tokenizer/unidic-compress"]
-ko-dic-compress = ["lindera-tokenizer/ko-dic-compress"]
-cc-cedict-compress = ["lindera-tokenizer/cc-cedict-compress"]
 
 [dependencies]
-tantivy-tokenizer-api = "0.2.0"
+tantivy-tokenizer-api = "0.3.0"
 
-lindera-core = "0.27.0"
-lindera-dictionary = "0.27.0"
-lindera-tokenizer = "0.27.0"
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tokenizer = "0.32.2"
 
 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
-tantivy = "0.21.1"
+tantivy = "0.22.0"
 
 [[bench]]
 name = "bench"
diff --git a/README.md b/README.md
index c7f5362..40e8da6 100644
--- a/README.md
+++ b/README.md
@@ -12,27 +12,23 @@ The following example enables IPADIC.
 
 ```
 [dependencies]
-lindera-core = "0.24.0"
-lindera-dictionary = "0.24.0"
-lindera-tantivy = { version = "0.24.0", features = ["ipadic"] }
+lindera-core = "0.32.2"
+lindera-dictionary = "0.32.2"
+lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
 ```
 
 ### Basic example
 
 ```rust
-use tantivy::{
-    collector::TopDocs,
-    doc,
-    query::QueryParser,
-    schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-    Index,
-};
-
-use lindera_core::mode::Mode;
-use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
-use lindera_tantivy::tokenizer::LinderaTokenizer;
-
 fn main() -> tantivy::Result<()> {
+    use tantivy::{
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+    };
+
+    use lindera_core::mode::Mode;
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
     // create schema builder
     let mut schema_builder = Schema::builder();
 
@@ -83,7 +79,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -134,8 +130,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/benches/bench.rs b/benches/bench.rs
index 653a19e..0b1cc26 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -8,7 +8,7 @@ fn bench_indexing(c: &mut Criterion) {
     use tantivy::Index;
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -58,7 +58,7 @@ fn bench_indexing(c: &mut Criterion) {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
diff --git a/examples/cc-cedict.rs b/examples/cc-cedict.rs
index 93e4bcd..8ebc94e 100644
--- a/examples/cc-cedict.rs
+++ b/examples/cc-cedict.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "cc-cedict")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::CcCedict),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/ipadic.rs b/examples/ipadic.rs
index 5717209..332388d 100644
--- a/examples/ipadic.rs
+++ b/examples/ipadic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::IPADIC),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/ko-dic.rs b/examples/ko-dic.rs
index 7a009e7..9ab5c09 100644
--- a/examples/ko-dic.rs
+++ b/examples/ko-dic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "ko-dic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::KoDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/examples/unidic.rs b/examples/unidic.rs
index ef8e125..a1ce587 100644
--- a/examples/unidic.rs
+++ b/examples/unidic.rs
@@ -1,15 +1,11 @@
 #[cfg(feature = "unidic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs,
-        doc,
-        query::QueryParser,
-        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
-        Index,
+        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
     };
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -62,7 +58,7 @@ fn main() -> tantivy::Result<()> {
         kind: Some(DictionaryKind::UniDic),
         path: None,
     };
-    let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
     let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
@@ -113,8 +109,8 @@ fn main() -> tantivy::Result<()> {
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
     println!("Search Result:");
     for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
     }
 
     Ok(())
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index d8bd608..cb81864 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -49,17 +49,16 @@ mod tests {
     use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
 
     use lindera_core::mode::Mode;
-    use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
 
     use super::LinderaTokenizer;
 
-    #[cfg(feature = "ipadic")]
-    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+    fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
         let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::IPADIC),
+            kind: Some(dictionary_kind),
             path: None,
         };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
+        let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
         let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
         let mut token_stream = tokenizer.token_stream(text);
@@ -72,61 +71,24 @@ mod tests {
         tokens
     }
 
+    #[cfg(feature = "ipadic")]
+    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
+        token_stream_helper(text, DictionaryKind::IPADIC)
+    }
+
     #[cfg(feature = "unidic")]
     fn token_stream_helper_unidic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::UniDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::UniDic)
     }
 
     #[cfg(feature = "ko-dic")]
     fn token_stream_helper_kodic(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::KoDic),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::KoDic)
     }
 
     #[cfg(feature = "cc-cedict")]
     fn token_stream_helper_cccedict(text: &str) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(DictionaryKind::CcCedict),
-            path: None,
-        };
-        let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
-
-        let mut token_stream = tokenizer.token_stream(text);
-        let mut tokens: Vec<Token> = vec![];
-        let mut add_token = |token: &Token| {
-            tokens.push(token.clone());
-        };
-        token_stream.process(&mut add_token);
-
-        tokens
+        token_stream_helper(text, DictionaryKind::CcCedict)
     }
 
     /// This is a function that can be used in tests and doc tests
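
Migration notes (not part of the patch): the two changes this diff repeats across every file are (1) lindera-dictionary 0.32 moving the free function `load_dictionary_from_config` onto the `DictionaryLoader` type, and (2) tantivy 0.22 turning `Document` into a trait, so a retrieved document needs a concrete type annotation (`TantivyDocument`) and JSON serialization moves from `schema.to_json(&doc)` to `doc.to_json(&schema)`. A minimal sketch of the resulting tokenizer setup, assuming the `ipadic` feature is enabled; the helper name `build_ipadic_tokenizer` is illustrative and does not appear in the patch:

```rust
use lindera_core::mode::Mode;
use lindera_dictionary::{DictionaryConfig, DictionaryKind, DictionaryLoader};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// Hypothetical helper (not from the patch): builds the tokenizer the way
// every example in this diff now does it.
fn build_ipadic_tokenizer() -> LinderaTokenizer {
    let dictionary_config = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None,
    };
    // 0.27: load_dictionary_from_config(dictionary_config)
    // 0.32: the loader is an associated function on DictionaryLoader.
    let dictionary =
        DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
    LinderaTokenizer::new(dictionary, None, Mode::Normal)
}
```

On the retrieval side, the pattern changes from `let retrieved_doc = searcher.doc(doc_address)?` with `schema.to_json(&retrieved_doc)` to `let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?` with `retrieved_doc.to_json(&schema)`, importing `Document` and `TantivyDocument` from the `tantivy` crate root as the examples above do.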