From 6afa5eda00cfcde2766ba59a74c5cea050d1dbf3 Mon Sep 17 00:00:00 2001 From: Koichi Akabe Date: Tue, 7 Mar 2023 10:51:13 +0900 Subject: [PATCH] Update README to describe zstd format (#88) * Update README to describe zstd format * update * update * Update README-ja.md --- README-ja.md | 13 +++++++++++++ README.md | 15 +++++++++++++++ vaporetto/README.md | 24 +++++++++++++++++++----- vaporetto/src/lib.rs | 13 ++++++++----- 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/README-ja.md b/README-ja.md index 41596f1..b547c62 100644 --- a/README-ja.md +++ b/README-ja.md @@ -45,6 +45,19 @@ Vaporetto はトークン化モデルを生成するための方法を3つ用意 ヴェネツィア は イタリア に あり ます 。 ``` +##### Vaporetto APIs を使用する際の注意点 + +配布モデルは zstd 形式で圧縮されています。 +*vaporetto* APIでこれらの圧縮済みモデルを読み込むには、APIの外側で展開する必要があります。 + +```rust +// zstd クレートまたは ruzstd クレートが必要 +let reader = zstd::Decoder::new(File::open("path/to/model.bin.zst")?)?; +let model = Model::read(reader)?; +``` + +最近のLinuxディストリビューションに同梱されている *unzstd* コマンドを利用して展開することもできます。 + #### KyTea のモデルを変換する 2つ目の方法も単純で、 KyTea で学習されたモデルを変換することです。 diff --git a/README.md b/README.md index 6bbd482..ef327ad 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,21 @@ The following will be output: ヴェネツィア は イタリア に あり ます 。 ``` +##### Notes for Vaporetto APIs + +The distribution models are compressed in the zstd format. +If you want to load these compressed models with the *vaporetto* API, +you must decompress them outside of the API. + +```rust +// Requires zstd crate or ruzstd crate +let reader = zstd::Decoder::new(File::open("path/to/model.bin.zst")?)?; +let model = Model::read(reader)?; +``` + +You can also decompress the file using the *unzstd* command, which is bundled with modern Linux +distributions. + #### Convert KyTea's Model The second is also a simple way, which is to convert a model trained by KyTea. 
diff --git a/vaporetto/README.md b/vaporetto/README.md index f3338b0..9bf9ce7 100644 --- a/vaporetto/README.md +++ b/vaporetto/README.md @@ -9,15 +9,15 @@ use std::fs::File; use vaporetto::{Model, Predictor, Sentence}; -let f = File::open("../resources/model.bin").unwrap(); -let model = Model::read(f).unwrap(); -let predictor = Predictor::new(model, true).unwrap(); +let f = File::open("../resources/model.bin")?; +let model = Model::read(f)?; +let predictor = Predictor::new(model, true)?; let mut buf = String::new(); let mut s = Sentence::default(); -s.update_raw("まぁ社長は火星猫だ").unwrap(); +s.update_raw("まぁ社長は火星猫だ")?; predictor.predict(&mut s); s.fill_tags(); s.write_tokenized_text(&mut buf); @@ -26,7 +26,7 @@ assert_eq!( buf, ); -s.update_raw("まぁ良いだろう").unwrap(); +s.update_raw("まぁ良いだろう")?; predictor.predict(&mut s); s.fill_tags(); s.write_tokenized_text(&mut buf); @@ -53,6 +53,20 @@ The following features are enabled by default: * `tag-prediction` - Enables tag prediction. * `charwise-pma` - Uses the [Charwise Daachorse](https://docs.rs/daachorse/latest/daachorse/charwise/index.html) instead of the standard version for faster prediction, although it can make to load a model file slower. +## Notes for distributed models + +The distributed models are compressed in the zstd format. +If you want to load these compressed models, you must decompress them outside of the API. + +```rust +// Requires zstd crate or ruzstd crate +let reader = zstd::Decoder::new(File::open("path/to/model.bin.zst")?)?; +let model = Model::read(reader)?; +``` + +You can also decompress the file using the *unzstd* command, which is bundled with modern Linux +distributions. 
+ ## License Licensed under either of diff --git a/vaporetto/src/lib.rs b/vaporetto/src/lib.rs index dfc5ae0..56726aa 100644 --- a/vaporetto/src/lib.rs +++ b/vaporetto/src/lib.rs @@ -9,19 +9,20 @@ ## Examples ``` +# fn main() -> Result<(), Box<dyn std::error::Error>> { use std::fs::File; use vaporetto::{Model, Predictor, Sentence}; -let f = File::open(\"../resources/model.bin\").unwrap(); -let model = Model::read(f).unwrap(); -let predictor = Predictor::new(model, true).unwrap(); +let f = File::open(\"../resources/model.bin\")?; +let model = Model::read(f)?; +let predictor = Predictor::new(model, true)?; let mut buf = String::new(); let mut s = Sentence::default(); -s.update_raw(\"まぁ社長は火星猫だ\").unwrap(); +s.update_raw(\"まぁ社長は火星猫だ\")?; predictor.predict(&mut s); s.fill_tags(); s.write_tokenized_text(&mut buf); @@ -30,7 +31,7 @@ assert_eq!( buf, ); -s.update_raw(\"まぁ良いだろう\").unwrap(); +s.update_raw(\"まぁ良いだろう\")?; predictor.predict(&mut s); s.fill_tags(); s.write_tokenized_text(&mut buf); @@ -38,6 +39,8 @@ assert_eq!( \"まぁ/副詞/マー 良い/形容詞/ヨイ だろう/助動詞/ダロー\", buf, ); +# Ok(()) +# } ``` " )]