diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 55a1c9b..0460be3 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -8,18 +8,19 @@ on:
 
 jobs:
   create-release:
-    name: Create Release
+    name: Upload artifact
     runs-on: ubuntu-latest
     steps:
       - id: create-release
-        uses: actions/create-release@v1.0.0
+        uses: softprops/action-gh-release@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
+          name: Release ${{ github.ref_name }}
          tag_name: ${{ github.ref }}
-          release_name: Release ${{ github.ref }}
          draft: false
          prerelease: false
+          generate_release_notes: true
 
   publish-crates:
     name: Publish crate
diff --git a/CHANGES.md b/CHANGES.md
deleted file mode 100644
index f257b4d..0000000
--- a/CHANGES.md
+++ /dev/null
@@ -1,133 +0,0 @@
-# Release notes
-All notable changes to this project will be documented in this file.
-This project adheres to [Semantic Versioning](http://semver.org/).
-
-## 0.27.1 (2023-12-02)
-- Bump up version to 0.27.1 #83 @mosuka
-- Update tantivy #82 @mosuka
-
-## 0.27.0 (2023-07-10)
-- Update Lindera to 0.27.0 #80 @mosuka
-
-## 0.25.1 (2023-06-27)
-- Use tokenizer api crate. #78 @massot
-
-## 0.25.0 (2023-06-19)
-- Update dependencies #77 @mosuka
-
-## 0.24.0 (2023-04-30)
-- Update Lindera #76 @mosuka
-
-## 0.23.1 (2023-04-07)
-- Update Lindera #74 @mosuka
-
-## 0.23.0 (2023-02-23)
-- Update dependencies #72
-
-## 0.21.0 (2023-01-23)
-- Use Lindera Analyzer instead of Lindera Tokenizer #68 @mosuka
-
-## 0.20.0 (2023-01-16)
-- Update dependencies #67 @mosuka
-
-## 0.19.1 (2023-01-08)
-- Fix build errors #66 @mosuka
-
-## 0.19.0 (2022-12-19)
-- Update Lindera to 0.19.0 #62 @mosuka
-
-## 0.18.0 (2022-10-27)
-- Update dependencies #59 @mosuka
-
-## 0.16.2 (2022-09-20)
-- Update Lindera to 0.16.2 #56 @mosuka
-
-## 0.16.0 (2022-09-12)
-- Update Lindera to 0.16.0 #54 @mosuka
-
-## 0.14.0 (2022-07-02)
-- Update Lindera to 0.14.0 #53 @mosuka
-
-## 0.13.5 (2022-06-13)
-- Upgrade Tantivy to 0.18.0 #52 @mosuka
-
-## 0.13.4 (2022-05-10)
-- Update Lindera to 0.13.5 #50 @mosuka
-
-## 0.13.3 (2022-04-08)
-- Add compress feature #49 @mosuka
-
-## 0.13.2 (2022-04-08)
-- Update Lindera to 0.13.2 #48 @mosuka
-
-## 0.13.1 (2022-04-08)
-- Update Lindera to 0.13.1 #47 @mosuka
-
-## 0.13.0 (2022-04-07)
-- Update Lindera to 0.13.0 #46 @mosuka
-
-## 0.12.5 (2022-04-06)
-- Bump Lindera to 0.12.6 #45 @mosuka
-
-## 0.12.4 (2022-04-04)
-- Update lindera to 0.12.5 #44 @mosuka
-
-## 0.12.3 (2022-04-04)
-- Update lindera to 0.12.4 #43 @mosuka
-
-## 0.12.2 (2022-04-01)
-- Update Lindera to 0.12.2 #42 @mosuka
-
-## 0.12.0 (2022-03-17)
-- Update Lindera to 0.12.0 #41 @mosuka
-
-## 0.11.1 (2022-03-09)
-- Update Lindera and Tantivy #37 @mosuka
-
-## 0.10.0 (2022-02-25)
-- Update lindera to 0.10.0 #32 @mosuka
-
-## 0.9.0 (2022-02-20)
-- Update Lindera to v0.9.0 #30 @mosuka
-- Use RwLock instead of cloning tokenizer #27 @vbkaisetsu
-
-## 0.8.1 (2021-11-13)
-- Update lindera to 0.8.1 #26 @mosuka
-- Update tantivy requirement from 0.15 to 0.16 #25
-
-## 0.8.0 (2021-08-22)
-- Bump up version to 0.8.0. #23 @mosuka
-- Update Lindera to 0.8 and Tantivy to 0.15. #22 @mosuka
-
-## 0.7.2 (2021-02-08)
-- Upgrade Tantivy to 0.14.0 #19 @mosuka
-- Bump up version to 0.7.2 #21 @mosuka
-
-## 0.7.1 (2020-10-15)
-- Bump up version to 0.7.1 #17 @mosuka
-
-## 0.7.0 (2020-10-12)
-- Bump up version to 0.7.0 #14 @mosuka
-- Upgrade dependencies #13 @mosuka
-
-## 0.6.0 (2020-10-08)
-- Bump up version to 0.6.0 #12 @mosuka
-- Add GitHub Actions integration #11 mosuka
-- Upgrade lindera 0.6.0 and tantivy 0.13.2 @johtani
-
-## 0.2.0 (2020-08-25)
-- Update Makefile #8 @mosuka
-- Bump up version (tantivy 0.13.0) #7 @ken0x0a
-
-## 0.1.3 (2020-05-30)
-- Bump up version #6 @mosuka
-- Update dependencies #5 @mosuka
-
-## 0.1.2 (2020-05-22)
-- Update dependencies #4 @mosuka
-
-## 0.1.1 (2020-05-01)
-- Update dependencies #3 @mosuka
-
-## 0.1.0 (2020-02-25)
-- First release @mosuka
diff --git a/Cargo.toml b/Cargo.toml
index f72d289..70dd80d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,29 +1,30 @@
 [package]
 name = "lindera-tantivy"
-version = "0.32.2"
+version = "0.38.0"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
-homepage = "https://github.com/lindera-morphology/lindera-tantivy"
-repository = "https://github.com/lindera-morphology/lindera-tantivy"
+homepage = "https://github.com/lindera/lindera-tantivy"
+repository = "https://github.com/lindera/lindera-tantivy"
 readme = "README.md"
 keywords = ["tokenizer", "tantivy", "lindera"]
 categories = ["text-processing"]
 license = "MIT"
 
 [features]
-default = []
-ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
-unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
-ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
-cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
+default = [] # No dictionaries included
+ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
+ipadic-neologd = ["lindera/ipadic-neologd"] # Include IPADIC NEologd dictionary (Japanese)
+unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
+ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
+cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
+compress = ["lindera/compress"] # Compress dictionaries
 
 [dependencies]
 tantivy-tokenizer-api = "0.3.0"
+tantivy = "0.22.0"
 
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tokenizer = "0.32.2"
+lindera = "0.38.0"
 
 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
diff --git a/README.md b/README.md
index 40e8da6..c2e9e66 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,8 @@ The following example enables IPADIC.
 
 ```
 [dependencies]
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
+lindera = "0.38"
+lindera-tantivy = { version = "0.38.0", features = ["ipadic"] }
 ```
 
 ### Basic example
@@ -22,11 +21,15 @@ lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
 ```rust
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -75,12 +78,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with IPADIC
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
@@ -138,6 +140,130 @@ fn main() -> tantivy::Result<()> {
 }
 ```
 
+### Config by YAML
+
+```rust
+use std::path::PathBuf;
+
+fn main() -> tantivy::Result<()> {
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
+    };
+
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
+    // create schema builder
+    let mut schema_builder = Schema::builder();
+
+    // add id field
+    let id = schema_builder.add_text_field(
+        "id",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("raw")
+                    .set_index_option(IndexRecordOption::Basic),
+            )
+            .set_stored(),
+    );
+
+    // add title field
+    let title = schema_builder.add_text_field(
+        "title",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("lang_ja")
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+            )
+            .set_stored(),
+    );
+
+    // add body field
+    let body = schema_builder.add_text_field(
+        "body",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("lang_ja")
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+            )
+            .set_stored(),
+    );
+
+    // build schema
+    let schema = schema_builder.build();
+
+    // create index in memory
+    let index = Index::create_in_ram(schema.clone());
+
+    // Build tokenizer with config file
+    let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("./examples")
+        .join("lindera.yml");
+    let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;
+
+    // register Lindera tokenizer
+    index.tokenizers().register("lang_ja", tokenizer);
+
+    // create index writer
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "1",
+        title => "成田国際空港",
+        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
+    )).unwrap();
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "2",
+        title => "東京国際空港",
+        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
+    )).unwrap();
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "3",
+        title => "関西国際空港",
+        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
+    )).unwrap();
+
+    // commit
+    index_writer.commit()?;
+
+    // create reader
+    let reader = index.reader()?;
+
+    // create searcher
+    let searcher = reader.searcher();
+
+    // create query parser
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+
+    // parse query
+    let query_str = "TOKYO";
+    let query = query_parser.parse_query(query_str)?;
+    println!("Query String: {}", query_str);
+
+    // search
+    println!("Parsed Query: {:?}", query);
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+    println!("Search Result:");
+    for (_, doc_address) in top_docs {
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
+    }
+
+    Ok(())
+}
+```
+
 ## API reference
 
 The API reference is available. Please see following URL:
diff --git a/benches/bench.rs b/benches/bench.rs
index 0b1cc26..12e8a36 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -3,12 +3,14 @@ use criterion::{criterion_group, criterion_main};
 
 #[cfg(feature = "ipadic")]
 fn bench_indexing(c: &mut Criterion) {
+    use lindera::dictionary::load_dictionary_from_kind;
+    use lindera::segmenter::Segmenter;
     use tantivy::doc;
     use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
     use tantivy::Index;
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::mode::Mode;
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -54,12 +56,11 @@ fn bench_indexing(c: &mut Criterion) {
         docs.push(doc);
     }
 
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
diff --git a/examples/cc-cedict.rs b/examples/cc-cedict.rs
index 8ebc94e..6310a6d 100644
--- a/examples/cc-cedict.rs
+++ b/examples/cc-cedict.rs
@@ -1,11 +1,15 @@
 #[cfg(feature = "cc-cedict")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -54,12 +58,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with CC-CEDICT
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::CcCedict),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::CcCedict).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_zh", tokenizer);
diff --git a/examples/ipadic.rs b/examples/ipadic.rs
index 332388d..d4c569f 100644
--- a/examples/ipadic.rs
+++ b/examples/ipadic.rs
@@ -1,11 +1,15 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -54,12 +58,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with IPADIC
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
diff --git a/examples/ko-dic.rs b/examples/ko-dic.rs
index 9ab5c09..235d816 100644
--- a/examples/ko-dic.rs
+++ b/examples/ko-dic.rs
@@ -1,11 +1,15 @@
 #[cfg(feature = "ko-dic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -54,12 +58,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with ko-dic
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::KoDic),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::KoDic).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ko", tokenizer);
diff --git a/examples/lindera.yml b/examples/lindera.yml
new file mode 100644
index 0000000..ee7b33b
--- /dev/null
+++ b/examples/lindera.yml
@@ -0,0 +1,69 @@
+segmenter:
+  mode: "normal"
+  dictionary:
+    kind: "ipadic"
+  # user_dictionary:
+  #   path: "./resources/ipadic_simple.csv"
+  #   kind: "ipadic"
+
+character_filters:
+  - kind: "unicode_normalize"
+    args:
+      kind: "nfkc"
+  - kind: "japanese_iteration_mark"
+    args:
+      normalize_kanji: true
+      normalize_kana: true
+  - kind: mapping
+    args:
+      mapping:
+        リンデラ: Lindera
+
+token_filters:
+  - kind: "japanese_compound_word"
+    args:
+      kind: "ipadic"
+      tags:
+        - "名詞,数"
+        - "名詞,接尾,助数詞"
+      new_tag: "名詞,数"
+  - kind: "japanese_number"
+    args:
+      tags:
+        - "名詞,数"
+  - kind: "japanese_stop_tags"
+    args:
+      tags:
+        - "接続詞"
+        - "助詞"
+        - "助詞,格助詞"
+        - "助詞,格助詞,一般"
+        - "助詞,格助詞,引用"
+        - "助詞,格助詞,連語"
+        - "助詞,係助詞"
+        - "助詞,副助詞"
+        - "助詞,間投助詞"
+        - "助詞,並立助詞"
+        - "助詞,終助詞"
+        - "助詞,副助詞/並立助詞/終助詞"
+        - "助詞,連体化"
+        - "助詞,副詞化"
+        - "助詞,特殊"
+        - "助動詞"
+        - "記号"
+        - "記号,一般"
+        - "記号,読点"
+        - "記号,句点"
+        - "記号,空白"
+        - "記号,括弧閉"
+        - "その他,間投"
+        - "フィラー"
+        - "非言語音"
+  - kind: "japanese_katakana_stem"
+    args:
+      min: 3
+  - kind: "remove_diacritical_mark"
+    args:
+      japanese: false
+  - kind: "lowercase"
+    args: {}
diff --git a/examples/tokenize_with_config.rs b/examples/tokenize_with_config.rs
new file mode 100644
index 0000000..9d31653
--- /dev/null
+++ b/examples/tokenize_with_config.rs
@@ -0,0 +1,119 @@
+use std::path::PathBuf;
+
+fn main() -> tantivy::Result<()> {
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
+    };
+
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
+    // create schema builder
+    let mut schema_builder = Schema::builder();
+
+    // add id field
+    let id = schema_builder.add_text_field(
+        "id",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("raw")
+                    .set_index_option(IndexRecordOption::Basic),
+            )
+            .set_stored(),
+    );
+
+    // add title field
+    let title = schema_builder.add_text_field(
+        "title",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("lang_ja")
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+            )
+            .set_stored(),
+    );
+
+    // add body field
+    let body = schema_builder.add_text_field(
+        "body",
+        TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_tokenizer("lang_ja")
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+            )
+            .set_stored(),
+    );
+
+    // build schema
+    let schema = schema_builder.build();
+
+    // create index in memory
+    let index = Index::create_in_ram(schema.clone());
+
+    // Build tokenizer with config file
+    let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("./examples")
+        .join("lindera.yml");
+    let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;
+
+    // register Lindera tokenizer
+    index.tokenizers().register("lang_ja", tokenizer);
+
+    // create index writer
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "1",
+        title => "成田国際空港",
+        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
+    )).unwrap();
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "2",
+        title => "東京国際空港",
+        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
+    )).unwrap();
+
+    // add document
+    index_writer.add_document(doc!(
+        id => "3",
+        title => "関西国際空港",
+        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
+    )).unwrap();
+
+    // commit
+    index_writer.commit()?;
+
+    // create reader
+    let reader = index.reader()?;
+
+    // create searcher
+    let searcher = reader.searcher();
+
+    // create query parser
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+
+    // parse query
+    let query_str = "TOKYO";
+    let query = query_parser.parse_query(query_str)?;
+    println!("Query String: {}", query_str);
+
+    // search
+    println!("Parsed Query: {:?}", query);
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+    println!("Search Result:");
+    for (_, doc_address) in top_docs {
+        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+        println!("{}", retrieved_doc.to_json(&schema));
+    }
+
+    Ok(())
+}
diff --git a/examples/unidic.rs b/examples/unidic.rs
index a1ce587..b1a65d2 100644
--- a/examples/unidic.rs
+++ b/examples/unidic.rs
@@ -1,11 +1,15 @@
 #[cfg(feature = "unidic")]
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -54,12 +58,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with UniDic
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::UniDic),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::UniDic).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
diff --git a/src/stream.rs b/src/stream.rs
index 868be95..8af2c38 100644
--- a/src/stream.rs
+++ b/src/stream.rs
@@ -1,6 +1,6 @@
 use tantivy_tokenizer_api::{Token, TokenStream};
 
-use lindera_tokenizer::token::Token as LToken;
+use lindera::token::Token as LToken;
 
 pub struct LinderaTokenStream<'a> {
     pub tokens: Vec<LToken<'a>>,
@@ -23,10 +23,10 @@ impl<'a> TokenStream for LinderaTokenStream<'a> {
     }
 
     fn token(&self) -> &Token {
-        &self.token
+        self.token
     }
 
     fn token_mut(&mut self) -> &mut Token {
-        &mut self.token
+        self.token
     }
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index cb81864..1bc3b04 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,8 +1,10 @@
-use lindera_core::{
-    dictionary::{Dictionary, UserDictionary},
-    mode::Mode,
-};
-use lindera_tokenizer::tokenizer::Tokenizer as LTokenizer;
+use std::path::Path;
+
+use lindera::character_filter::BoxCharacterFilter;
+use lindera::token_filter::BoxTokenFilter;
+use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
+use tantivy::Result;
+use tantivy::TantivyError;
 use tantivy_tokenizer_api::{Token, Tokenizer};
 
 use crate::stream::LinderaTokenStream;
@@ -14,16 +16,56 @@ pub struct LinderaTokenizer {
 }
 
 impl LinderaTokenizer {
-    pub fn new(
-        dictionary: Dictionary,
-        user_dictionary: Option<UserDictionary>,
-        mode: Mode,
-    ) -> LinderaTokenizer {
+    /// Create a new `LinderaTokenizer`.
+    /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
+    pub fn new() -> Result<LinderaTokenizer> {
+        let builder = TokenizerBuilder::new()
+            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
+        let tokenizer = builder
+            .build()
+            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
+        Ok(LinderaTokenizer {
+            tokenizer,
+            token: Default::default(),
+        })
+    }
+
+    /// Create a new `LinderaTokenizer`.
+    /// This function will create a new `LinderaTokenizer` with settings from the YAML file.
+    pub fn from_file(file_path: &Path) -> Result<LinderaTokenizer> {
+        let builder = TokenizerBuilder::from_file(file_path)
+            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
+        let tokenizer = builder
+            .build()
+            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
+        Ok(LinderaTokenizer {
+            tokenizer,
+            token: Default::default(),
+        })
+    }
+
+    /// Create a new `LinderaTokenizer`.
+    /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
+    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
         LinderaTokenizer {
-            tokenizer: LTokenizer::new(dictionary, user_dictionary, mode),
+            tokenizer: LTokenizer::new(segmenter),
             token: Default::default(),
         }
     }
+
+    /// Append a character filter to the tokenizer.
+    pub fn append_character_filter(&mut self, character_filter: BoxCharacterFilter) -> &mut Self {
+        self.tokenizer.append_character_filter(character_filter);
+
+        self
+    }
+
+    /// Append a token filter to the tokenizer.
+    pub fn append_token_filter(&mut self, token_filter: BoxTokenFilter) -> &mut Self {
+        self.tokenizer.token_filters.push(token_filter);
+
+        self
+    }
 }
 
 impl Tokenizer for LinderaTokenizer {
@@ -46,20 +88,20 @@
     feature = "cc-cedict"
 ))]
 mod tests {
+    use lindera::segmenter::Segmenter;
     use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
+    use lindera::mode::Mode;
 
     use super::LinderaTokenizer;
 
     fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
-        let dictionary_config = DictionaryConfig {
-            kind: Some(dictionary_kind),
-            path: None,
-        };
-        let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-        let mut tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+        let mode = Mode::Normal;
+        let dictionary = load_dictionary_from_kind(dictionary_kind).unwrap();
+        let user_dictionary = None;
+        let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
         let mut token_stream = tokenizer.token_stream(text);
         let mut tokens: Vec<Token> = vec![];