Commit bfc893c

Update Lindera to 0.38.0 (#85)

mosuka authored Nov 19, 2024
1 parent e276de1 commit bfc893c

Showing 13 changed files with 463 additions and 225 deletions.
.github/workflows/release.yml (7 changes: 4 additions & 3 deletions)

```diff
@@ -8,18 +8,19 @@ on:
 
 jobs:
   create-release:
-    name: Create Release
+    name: Upload artifact
     runs-on: ubuntu-latest
     steps:
       - id: create-release
-        uses: actions/create-release@v1.0.0
+        uses: softprops/action-gh-release@v2
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
+          name: Release ${{ github.ref_name }}
           tag_name: ${{ github.ref }}
-          release_name: Release ${{ github.ref }}
           draft: false
           prerelease: false
+          generate_release_notes: true
 
   publish-crates:
     name: Publish crate
```
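The archived actions/create-release action is replaced by the maintained softprops/action-gh-release, and `generate_release_notes: true` lets GitHub generate the release notes automatically, which presumably takes over from the hand-maintained CHANGES.md deleted below. For reference, the rewritten step reads roughly as follows (reconstructed from the hunk above, not copied verbatim from the repository):

```yaml
# Reconstructed release step; surrounding context lines are assumed.
steps:
  - id: create-release
    uses: softprops/action-gh-release@v2
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    with:
      name: Release ${{ github.ref_name }}   # release title
      tag_name: ${{ github.ref }}
      draft: false
      prerelease: false
      generate_release_notes: true           # let GitHub write the notes
```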
CHANGES.md (133 changes: 0 additions & 133 deletions)

This file was deleted.

Cargo.toml (23 changes: 12 additions & 11 deletions)

```diff
@@ -1,29 +1,30 @@
 [package]
 name = "lindera-tantivy"
-version = "0.32.2"
+version = "0.38.0"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
-homepage = "https://github.com/lindera-morphology/lindera-tantivy"
-repository = "https://github.com/lindera-morphology/lindera-tantivy"
+homepage = "https://github.com/lindera/lindera-tantivy"
+repository = "https://github.com/lindera/lindera-tantivy"
 readme = "README.md"
 keywords = ["tokenizer", "tantivy", "lindera"]
 categories = ["text-processing"]
 license = "MIT"
 
 [features]
-default = []
-ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
-unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
-ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
-cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
+default = [] # No directories included
+ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
+ipadic-neologd = ["lindera/ipadic-neologd"] # Include IPADIC NEologd dictionary (Japanese)
+unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
+ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
+cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
+compress = ["lindera/compress"] # Compress dictionaries
 
 [dependencies]
 tantivy-tokenizer-api = "0.3.0"
+tantivy = "0.22.0"
 
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tokenizer = "0.32.2"
+lindera = "0.38.0"
 
 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
```
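The dictionary features now forward to the `lindera` crate rather than the removed `lindera-tokenizer`, so downstream projects still pick dictionaries through `lindera-tantivy`'s features. A sketch of a consumer manifest under that assumption (the package name is made up; the versions are the ones in the diff):

```toml
# Hypothetical downstream manifest; enables the Japanese IPADIC
# dictionary plus dictionary compression, both forwarded to lindera.
[package]
name = "my-search-app"   # illustrative name, not from the diff
version = "0.1.0"
edition = "2021"

[dependencies]
tantivy = "0.22.0"
lindera-tantivy = { version = "0.38.0", features = ["ipadic", "compress"] }
```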
README.md (150 changes: 138 additions & 12 deletions)

````diff
@@ -12,21 +12,24 @@ The following example enables IPADIC.
 
 ```
 [dependencies]
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
+lindera = "0.38"
+lindera-tantivy = { version = "0.38.0", features = ["ipadic"] }
 ```
 
 ### Basic example
 
 ```rust
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -75,12 +78,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with IPADIC
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
@@ -138,6 +140,130 @@ fn main() -> tantivy::Result<()> {
 }
 ```
````
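The heart of the README migration is the tokenizer construction: lindera 0.38 drops the `DictionaryConfig`/`DictionaryLoader` pair in favor of `load_dictionary_from_kind` plus a `Segmenter`. A minimal sketch of just that step, using only the calls shown in the diff (the function wrapper is illustrative):

```rust
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera_tantivy::tokenizer::LinderaTokenizer;

// Build a tantivy-compatible tokenizer from the embedded IPADIC
// dictionary (requires the `ipadic` feature of lindera-tantivy).
fn build_ipadic_tokenizer() -> LinderaTokenizer {
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)
        .expect("IPADIC should be embedded via the `ipadic` feature");
    // Normal segmentation mode, no user dictionary.
    let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
    LinderaTokenizer::from_segmenter(segmenter)
}
```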

The final hunk adds a new "Config by YAML" section to the README:

### Config by YAML

```rust
use std::path::PathBuf;

fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Document, Index, TantivyDocument,
};

use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

// add id field
let id = schema_builder.add_text_field(
"id",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
)
.set_stored(),
);

// add title field
let title = schema_builder.add_text_field(
"title",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// add body field
let body = schema_builder.add_text_field(
"body",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// build schema
let schema = schema_builder.build();

    // create index in memory
let index = Index::create_in_ram(schema.clone());

// Build tokenizer with config file
let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("./examples")
.join("lindera.yml");
let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;

// register Lindera tokenizer
index.tokenizers().register("lang_ja", tokenizer);

// create index writer
let mut index_writer = index.writer(50_000_000)?;

// add document
index_writer.add_document(doc!(
id => "1",
title => "成田国際空港",
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京国際空港",
body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "関西国際空港",
body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
)).unwrap();

// commit
index_writer.commit()?;

// create reader
let reader = index.reader()?;

// create searcher
let searcher = reader.searcher();

    // create query parser
let query_parser = QueryParser::for_index(&index, vec![title, body]);

// parse query
let query_str = "TOKYO";
let query = query_parser.parse_query(query_str)?;
println!("Query String: {}", query_str);

// search
println!("Parsed Query: {:?}", query);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Search Result:");
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}

Ok(())
}
```
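The diff does not show `examples/lindera.yml` itself, so the shape of the config file has to be read from the lindera crate. As a loose, hypothetical sketch of what such a file might contain (the file shipped in the repository's examples directory is the authoritative reference):

```yaml
# Hypothetical lindera.yml sketch; not taken from this diff.
# Field names follow lindera 0.38's tokenizer config as an assumption.
segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"       # embedded dictionary to load
  # user_dictionary:
  #   path: "./resources/userdic.csv"
```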

## API reference

The API reference is available. Please see the following URL:

- https://docs.rs/lindera-tantivy