Commit bfc893c

Update Lindera to 0.38.0 (#85)

mosuka authored Nov 19, 2024
1 parent e276de1 commit bfc893c

Showing 13 changed files with 463 additions and 225 deletions.
.github/workflows/release.yml (7 changes: 4 additions & 3 deletions)

```diff
@@ -8,18 +8,19 @@ on:
 
 jobs:
   create-release:
-    name: Create Release
+    name: Upload artifact
     runs-on: ubuntu-latest
     steps:
       - id: create-release
-        uses: actions/create-release@v1.0.0
+        uses: softprops/action-gh-release@v2
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
+          name: Release ${{ github.ref_name }}
           tag_name: ${{ github.ref }}
-          release_name: Release ${{ github.ref }}
           draft: false
           prerelease: false
+          generate_release_notes: true
 
   publish-crates:
     name: Publish crate
```
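The archived actions/create-release action is replaced by the maintained softprops/action-gh-release, and `generate_release_notes: true` lets GitHub generate the release notes automatically, which presumably takes over from the hand-maintained CHANGES.md deleted below. For reference, the rewritten step reads roughly as follows (reconstructed from the hunk above, not copied verbatim from the repository):

```yaml
# Reconstructed release step; surrounding context lines are assumed.
steps:
  - id: create-release
    uses: softprops/action-gh-release@v2
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    with:
      name: Release ${{ github.ref_name }}   # release title
      tag_name: ${{ github.ref }}
      draft: false
      prerelease: false
      generate_release_notes: true           # let GitHub write the notes
```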
CHANGES.md (133 changes: 0 additions & 133 deletions)

This file was deleted.

Cargo.toml (23 changes: 12 additions & 11 deletions)

```diff
@@ -1,29 +1,30 @@
 [package]
 name = "lindera-tantivy"
-version = "0.32.2"
+version = "0.38.0"
 edition = "2021"
 description = "Lindera Tokenizer for Tantivy."
 documentation = "https://docs.rs/lindera-tantivy"
-homepage = "https://github.com/lindera-morphology/lindera-tantivy"
-repository = "https://github.com/lindera-morphology/lindera-tantivy"
+homepage = "https://github.com/lindera/lindera-tantivy"
+repository = "https://github.com/lindera/lindera-tantivy"
 readme = "README.md"
 keywords = ["tokenizer", "tantivy", "lindera"]
 categories = ["text-processing"]
 license = "MIT"
 
 [features]
-default = []
-ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
-unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
-ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
-cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
+default = [] # No directories included
+ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
+ipadic-neologd = ["lindera/ipadic-neologd"] # Include IPADIC NEologd dictionary (Japanese)
+unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
+ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
+cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
+compress = ["lindera/compress"] # Compress dictionaries
 
 [dependencies]
 tantivy-tokenizer-api = "0.3.0"
+tantivy = "0.22.0"
 
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tokenizer = "0.32.2"
+lindera = "0.38.0"
 
 [dev-dependencies]
 criterion = { version = "0.5.1", features = ["html_reports"] }
```
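The dictionary features now forward to the `lindera` crate rather than the removed `lindera-tokenizer`, so downstream projects still pick dictionaries through `lindera-tantivy`'s features. A sketch of a consumer manifest under that assumption (the package name is made up; the versions are the ones in the diff):

```toml
# Hypothetical downstream manifest; enables the Japanese IPADIC
# dictionary plus dictionary compression, both forwarded to lindera.
[package]
name = "my-search-app"   # illustrative name, not from the diff
version = "0.1.0"
edition = "2021"

[dependencies]
tantivy = "0.22.0"
lindera-tantivy = { version = "0.38.0", features = ["ipadic", "compress"] }
```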
README.md (150 changes: 138 additions & 12 deletions)

````diff
@@ -12,21 +12,24 @@ The following example enables IPADIC.
 
 ```
 [dependencies]
-lindera-core = "0.32.2"
-lindera-dictionary = "0.32.2"
-lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
+lindera = "0.38"
+lindera-tantivy = { version = "0.38.0", features = ["ipadic"] }
 ```
 
 ### Basic example
 
 ```rust
 fn main() -> tantivy::Result<()> {
     use tantivy::{
-        collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Document, Index, TantivyDocument,
     };
 
-    use lindera_core::mode::Mode;
-    use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
+    use lindera::dictionary::DictionaryKind;
+    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
     use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
@@ -75,12 +78,11 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());
 
     // Tokenizer with IPADIC
-    let dictionary_config = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-    let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
-    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
+    let mode = Mode::Normal;
+    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+    let user_dictionary = None;
+    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
+    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
     // register Lindera tokenizer
     index.tokenizers().register("lang_ja", tokenizer);
@@ -138,6 +140,130 @@ fn main() -> tantivy::Result<()> {
 }
 ```
````
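The heart of the README migration is the tokenizer construction: lindera 0.38 drops the `DictionaryConfig`/`DictionaryLoader` pair in favor of `load_dictionary_from_kind` plus a `Segmenter`. A minimal sketch of just that step, using only the calls shown in the diff (the function wrapper is illustrative):

```rust
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera_tantivy::tokenizer::LinderaTokenizer;

// Build a tantivy-compatible tokenizer from the embedded IPADIC
// dictionary (requires the `ipadic` feature of lindera-tantivy).
fn build_ipadic_tokenizer() -> LinderaTokenizer {
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)
        .expect("IPADIC should be embedded via the `ipadic` feature");
    // Normal segmentation mode, no user dictionary.
    let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
    LinderaTokenizer::from_segmenter(segmenter)
}
```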

The final hunk adds a new "Config by YAML" section to the README:

### Config by YAML

```rust
use std::path::PathBuf;

fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Document, Index, TantivyDocument,
};

use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

// add id field
let id = schema_builder.add_text_field(
"id",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
)
.set_stored(),
);

// add title field
let title = schema_builder.add_text_field(
"title",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// add body field
let body = schema_builder.add_text_field(
"body",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// build schema
let schema = schema_builder.build();

    // create index in memory
let index = Index::create_in_ram(schema.clone());

// Build tokenizer with config file
let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("./examples")
.join("lindera.yml");
let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;

// register Lindera tokenizer
index.tokenizers().register("lang_ja", tokenizer);

// create index writer
let mut index_writer = index.writer(50_000_000)?;

// add document
index_writer.add_document(doc!(
id => "1",
title => "成田国際空港",
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京国際空港",
body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "関西国際空港",
body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
)).unwrap();

// commit
index_writer.commit()?;

// create reader
let reader = index.reader()?;

// create searcher
let searcher = reader.searcher();

    // create query parser
let query_parser = QueryParser::for_index(&index, vec![title, body]);

// parse query
let query_str = "TOKYO";
let query = query_parser.parse_query(query_str)?;
println!("Query String: {}", query_str);

// search
println!("Parsed Query: {:?}", query);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Search Result:");
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}

Ok(())
}
```
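The diff does not show `examples/lindera.yml` itself, so the shape of the config file has to be read from the lindera crate. As a loose, hypothetical sketch of what such a file might contain (the file shipped in the repository's examples directory is the authoritative reference):

```yaml
# Hypothetical lindera.yml sketch; not taken from this diff.
# Field names follow lindera 0.38's tokenizer config as an assumption.
segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"       # embedded dictionary to load
  # user_dictionary:
  #   path: "./resources/userdic.csv"
```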

## API reference

The API reference is available. Please see the following URL:

- https://docs.rs/lindera-tantivy