From fce7dae639a026f3f291ffdb02e376ddaf51c87b Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Sun, 8 Jan 2023 18:01:14 +0800
Subject: [PATCH] move tokenizer API to separate crate

closes #1766

Finding tantivy tokenizers is currently a frustrating experience, since
they need to be updated for each tantivy version.
That's unnecessary since the API is rather stable anyway.
---
 Cargo.toml                       |   6 +-
 README.md                        |  10 +-
 src/tokenizer/mod.rs             |  10 +-
 src/tokenizer/ngram_tokenizer.rs |   3 +-
 src/tokenizer/tokenizer.rs       | 232 +-----------------------------
 tokenizer-api/Cargo.toml         |  10 ++
 tokenizer-api/README.md          |   6 +
 tokenizer-api/src/lib.rs         | 236 +++++++++++++++++++++++++++++++
 8 files changed, 272 insertions(+), 241 deletions(-)
 create mode 100644 tokenizer-api/Cargo.toml
 create mode 100644 tokenizer-api/README.md
 create mode 100644 tokenizer-api/src/lib.rs

diff --git a/Cargo.toml b/Cargo.toml
index 56dd8f4713..bd674564b9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ tantivy-bitpacker = { version="0.2", path="./bitpacker" }
 common = { version = "0.3", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
 ownedbytes = { version="0.3", path="./ownedbytes" }
+tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
 stable_deref_trait = "1.2.0"
 rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
@@ -61,6 +62,7 @@ measure_time = "0.8.2"
 ciborium = { version = "0.2", optional = true}
 async-trait = "0.1.53"
 arc-swap = "1.5.0"
+serde_derive = "1.0.152"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -91,7 +93,7 @@ debug-assertions = true
 overflow-checks = true
 
 [features]
-default = ["mmap", "lz4-compression" ]
+default = ["mmap", "lz4-compression"]
 mmap = ["fs2", "tempfile", "memmap2"]
 
 brotli-compression = ["brotli"]
@@ -105,7 +107,7 @@ unstable = [] # useful for benches.
 quickwit = ["ciborium"]
 
 [workspace]
-members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
+members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "tokenizer-api"]
 
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
diff --git a/README.md b/README.md
index fae8b9d232..116b4d00c7 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
 # Features
 
 - Full-text search
-- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
+- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command-line tools
 - BM25 scoring (the same as Lucene)
@@ -42,12 +42,12 @@ Your mileage WILL vary depending on the nature of queries and their load.
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, and hierarchical facet fields
-- LZ4 compressed document store
+- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
 - JSON Field
-- Aggregation Collector: range buckets, average, and stats metrics
+- Aggregation Collector: histogram, range buckets, average, and stats metrics
 - LogMergePolicy with deletes
 - Searcher Warmer API
 - Cheesy logo with a horse
@@ -81,6 +81,10 @@ There are many ways to support this project.
 
 We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
 
+## Tokenizer
+
+When implementing a tokenizer for tantivy, depend on the `tantivy-tokenizer-api` crate.
+
 ## Minimum supported Rust version
 
 Tantivy currently requires at least Rust 1.62 or later to compile.
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 3b511d2a62..b8961abe72 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -52,6 +52,8 @@
 //! remove their inflection. This tokenizer is slower than the default one,
 //! but is recommended to improve recall.
 //!
+//! # Custom tokenizer library
+//! Avoid using tantivy as a dependency; prefer `tantivy-tokenizer-api` instead.
 //!
 //! # Custom tokenizers
 //!
@@ -134,6 +136,10 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;
 
+pub use tokenizer_api::{
+    BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
+};
+
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
@@ -146,9 +152,7 @@ pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{
-    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
-};
+pub use self::tokenizer::TextAnalyzer;
 pub use self::tokenizer_manager::TokenizerManager;
 pub use self::whitespace_tokenizer::WhitespaceTokenizer;
 
diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs
index 150e58e78d..05ddefa4a2 100644
--- a/src/tokenizer/ngram_tokenizer.rs
+++ b/src/tokenizer/ngram_tokenizer.rs
@@ -303,8 +303,7 @@ mod tests {
 
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::Tokenizer;
-    use crate::tokenizer::{BoxTokenStream, Token};
+    use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};
 
     fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
         let mut tokens: Vec<Token> = vec![];
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 203202e4a2..5fa37685bc 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -1,42 +1,9 @@
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use std::borrow::{Borrow, BorrowMut};
-use std::ops::{Deref, DerefMut};
-
-use serde::{Deserialize, Serialize};
+use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
-/// Token
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-pub struct Token {
-    /// Offset (byte index) of the first character of the token.
-    /// Offsets shall not be modified by token filters.
-    pub offset_from: usize,
-    /// Offset (byte index) of the last character of the token + 1.
-    /// The text that generated the token should be obtained by
-    /// &text[token.offset_from..token.offset_to]
-    pub offset_to: usize,
-    /// Position, expressed in number of tokens.
-    pub position: usize,
-    /// Actual text content of the token.
-    pub text: String,
-    /// Is the length expressed in term of number of original tokens.
-    pub position_length: usize,
-}
-
-impl Default for Token {
-    fn default() -> Token {
-        Token {
-            offset_from: 0,
-            offset_to: 0,
-            position: usize::MAX,
-            text: String::with_capacity(200),
-            position_length: 1,
-        }
-    }
-}
-
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
@@ -112,200 +79,3 @@ impl Clone for TextAnalyzer {
         }
     }
 }
-
-/// `Tokenizer` are in charge of splitting text into a stream of token
-/// before indexing.
-///
-/// See the [module documentation](crate::tokenizer) for more detail.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
-    /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
-}
-
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
-}
-
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
-        Box::new(self.clone())
-    }
-}
-
-impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
-    fn advance(&mut self) -> bool {
-        let token_stream: &mut dyn TokenStream = self.borrow_mut();
-        token_stream.advance()
-    }
-
-    fn token<'b>(&'b self) -> &'b Token {
-        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
-        token_stream.token()
-    }
-
-    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
-        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
-        token_stream.token_mut()
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
-///
-/// See [`TokenStream`] for more information.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
-        BoxTokenStream(Box::new(token_stream))
-    }
-}
-
-impl<'a> Deref for BoxTokenStream<'a> {
-    type Target = dyn TokenStream + 'a;
-
-    fn deref(&self) -> &Self::Target {
-        &*self.0
-    }
-}
-impl<'a> DerefMut for BoxTokenStream<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut *self.0
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenFilter>`.
-///
-/// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
-
-impl Deref for BoxTokenFilter {
-    type Target = dyn TokenFilter;
-
-    fn deref(&self) -> &dyn TokenFilter {
-        &*self.0
-    }
-}
-
-impl<T: TokenFilter> From<T> for BoxTokenFilter {
-    fn from(tokenizer: T) -> BoxTokenFilter {
-        BoxTokenFilter(Box::new(tokenizer))
-    }
-}
-
-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-pub trait TokenStream {
-    /// Advance to the next token
-    ///
-    /// Returns false if there are no other tokens.
-    fn advance(&mut self) -> bool;
-
-    /// Returns a reference to the current token.
-    fn token(&self) -> &Token;
-
-    /// Returns a mutable reference to the current token.
-    fn token_mut(&mut self) -> &mut Token;
-
-    /// Helper to iterate over tokens. It
-    /// simply combines a call to `.advance()`
-    /// and `.token()`.
-    ///
-    /// ```
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser);
-    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-    /// while let Some(token) = token_stream.next() {
-    ///     println!("Token {:?}", token.text);
-    /// }
-    /// ```
-    fn next(&mut self) -> Option<&Token> {
-        if self.advance() {
-            Some(self.token())
-        } else {
-            None
-        }
-    }
-
-    /// Helper function to consume the entire `TokenStream`
-    /// and push the tokens to a sink function.
-    ///
-    /// Remove this.
-    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
-        while self.advance() {
-            sink(self.token());
-        }
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> BoxTokenFilter;
-}
-
-/// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> BoxTokenFilter {
-        BoxTokenFilter::from(self.clone())
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::Token;
-
-    #[test]
-    fn clone() {
-        let t1 = Token {
-            position: 1,
-            offset_from: 2,
-            offset_to: 3,
-            text: "abc".to_string(),
-            position_length: 1,
-        };
-        let t2 = t1.clone();
-
-        assert_eq!(t1.position, t2.position);
-        assert_eq!(t1.offset_from, t2.offset_from);
-        assert_eq!(t1.offset_to, t2.offset_to);
-        assert_eq!(t1.text, t2.text);
-    }
-}
diff --git a/tokenizer-api/Cargo.toml b/tokenizer-api/Cargo.toml
new file mode 100644
index 0000000000..e136d27b71
--- /dev/null
+++ b/tokenizer-api/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "tantivy-tokenizer-api"
+version = "0.1.0"
+edition = "2021"
+description = "Tokenizer API of tantivy"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde = { version = "1.0.152", features = ["derive"] }
diff --git a/tokenizer-api/README.md b/tokenizer-api/README.md
new file mode 100644
index 0000000000..401defc233
--- /dev/null
+++ b/tokenizer-api/README.md
@@ -0,0 +1,6 @@
+
+# Tokenizer API
+
+An API to interface a tokenizer with tantivy.
+
+The API will be kept stable in order not to break support for existing tokenizers.
diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
new file mode 100644
index 0000000000..35b04738b5
--- /dev/null
+++ b/tokenizer-api/src/lib.rs
@@ -0,0 +1,236 @@
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
+//! for each new tantivy version.
+//!
+//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
+//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
+
+use std::borrow::{Borrow, BorrowMut};
+use std::ops::{Deref, DerefMut};
+
+use serde::{Deserialize, Serialize};
+
+/// Token
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct Token {
+    /// Offset (byte index) of the first character of the token.
+    /// Offsets shall not be modified by token filters.
+    pub offset_from: usize,
+    /// Offset (byte index) of the last character of the token + 1.
+    /// The text that generated the token should be obtained by
+    /// &text[token.offset_from..token.offset_to]
+    pub offset_to: usize,
+    /// Position, expressed in number of tokens.
+    pub position: usize,
+    /// Actual text content of the token.
+    pub text: String,
+    /// Is the length expressed in term of number of original tokens.
+    pub position_length: usize,
+}
+
+impl Default for Token {
+    fn default() -> Token {
+        Token {
+            offset_from: 0,
+            offset_to: 0,
+            position: usize::MAX,
+            text: String::with_capacity(200),
+            position_length: 1,
+        }
+    }
+}
+
+/// `Tokenizer`s are in charge of splitting text into a stream of tokens
+/// before indexing.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
+pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+    /// Creates a token stream for a given `str`.
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+}
+
+pub trait TokenizerClone {
+    fn box_clone(&self) -> Box<dyn Tokenizer>;
+}
+
+impl<T: Tokenizer + Clone> TokenizerClone for T {
+    fn box_clone(&self) -> Box<dyn Tokenizer> {
+        Box::new(self.clone())
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
+///
+/// See [`TokenStream`] for more information.
+pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+
+impl<'a, T> From<T> for BoxTokenStream<'a>
+where
+    T: TokenStream + 'a,
+{
+    fn from(token_stream: T) -> BoxTokenStream<'a> {
+        BoxTokenStream(Box::new(token_stream))
+    }
+}
+
+impl<'a> Deref for BoxTokenStream<'a> {
+    type Target = dyn TokenStream + 'a;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.0
+    }
+}
+impl<'a> DerefMut for BoxTokenStream<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut *self.0
+    }
+}
+
+impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
+    fn advance(&mut self) -> bool {
+        let token_stream: &mut dyn TokenStream = self.borrow_mut();
+        token_stream.advance()
+    }
+
+    fn token<'b>(&'b self) -> &'b Token {
+        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
+        token_stream.token()
+    }
+
+    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
+        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
+        token_stream.token_mut()
+    }
+}
+
+/// `TokenStream` is the result of the tokenization.
+///
+/// It consists of a consumable stream of `Token`s.
+///
+/// # Example
+///
+/// ```
+/// use tantivy::tokenizer::*;
+///
+/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+///     .filter(RemoveLongFilter::limit(40))
+///     .filter(LowerCaser);
+/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "hello");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 5);
+///     assert_eq!(token.position, 0);
+/// }
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "happy");
+///     assert_eq!(token.offset_from, 7);
+///     assert_eq!(token.offset_to, 12);
+///     assert_eq!(token.position, 1);
+/// }
+/// ```
+pub trait TokenStream {
+    /// Advance to the next token
+    ///
+    /// Returns false if there are no other tokens.
+    fn advance(&mut self) -> bool;
+
+    /// Returns a reference to the current token.
+    fn token(&self) -> &Token;
+
+    /// Returns a mutable reference to the current token.
+    fn token_mut(&mut self) -> &mut Token;
+
+    /// Helper to iterate over tokens. It
+    /// simply combines a call to `.advance()`
+    /// and `.token()`.
+    ///
+    /// ```
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser);
+    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
+    /// while let Some(token) = token_stream.next() {
+    ///     println!("Token {:?}", token.text);
+    /// }
+    /// ```
+    fn next(&mut self) -> Option<&Token> {
+        if self.advance() {
+            Some(self.token())
+        } else {
+            None
+        }
+    }
+
+    /// Helper function to consume the entire `TokenStream`
+    /// and push the tokens to a sink function.
+    ///
+    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
+        while self.advance() {
+            sink(self.token());
+        }
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenFilter>`.
+///
+/// See [`TokenFilter`] for more information.
+pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn TokenFilter;
+
+    fn deref(&self) -> &dyn TokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> BoxTokenFilter;
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+    /// Wraps a token stream and returns the modified one.
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
+}
+
+impl<T: TokenFilter + Clone> TokenFilterClone for T {
+    fn box_clone(&self) -> BoxTokenFilter {
+        BoxTokenFilter::from(self.clone())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn clone() {
+        let t1 = Token {
+            position: 1,
+            offset_from: 2,
+            offset_to: 3,
+            text: "abc".to_string(),
+            position_length: 1,
+        };
+        let t2 = t1.clone();
+
+        assert_eq!(t1.position, t2.position);
+        assert_eq!(t1.offset_from, t2.offset_from);
+        assert_eq!(t1.offset_to, t2.offset_to);
+        assert_eq!(t1.text, t2.text);
+    }
+}
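For illustration only: a minimal sketch of what a third-party tokenizer could look like once it depends solely on the new `tantivy-tokenizer-api` crate. Only `Tokenizer`, `TokenStream`, `Token`, and `BoxTokenStream` come from the API above; the type names `SplitOnWhitespace` and `SplitOnWhitespaceStream` are hypothetical and not part of this patch.

```rust
// Hypothetical third-party crate: depends only on tantivy-tokenizer-api, not on tantivy itself.
use tantivy_tokenizer_api::{BoxTokenStream, Token, TokenStream, Tokenizer};

/// A toy tokenizer emitting one token per whitespace-separated word.
#[derive(Clone)]
pub struct SplitOnWhitespace;

impl Tokenizer for SplitOnWhitespace {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        // `BoxTokenStream` implements `From<T: TokenStream + 'a>`, so `.into()` boxes the stream.
        SplitOnWhitespaceStream {
            text,
            byte_offset: 0,
            token: Token::default(),
        }
        .into()
    }
}

pub struct SplitOnWhitespaceStream<'a> {
    text: &'a str,
    byte_offset: usize, // where the next scan starts
    token: Token,       // `Token::default()` starts `position` at usize::MAX
}

impl<'a> TokenStream for SplitOnWhitespaceStream<'a> {
    fn advance(&mut self) -> bool {
        let rest = &self.text[self.byte_offset..];
        // Skip leading whitespace; stop if only whitespace is left.
        let start = match rest.find(|c: char| !c.is_whitespace()) {
            Some(idx) => self.byte_offset + idx,
            None => return false,
        };
        // The token ends at the next whitespace character (or at the end of the input).
        let end = self.text[start..]
            .find(char::is_whitespace)
            .map(|idx| start + idx)
            .unwrap_or(self.text.len());
        self.token = Token {
            offset_from: start,
            offset_to: end,
            // Wrapping add turns the initial usize::MAX into 0 for the first token.
            position: self.token.position.wrapping_add(1),
            text: self.text[start..end].to_string(),
            position_length: 1,
        };
        self.byte_offset = end;
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```

Because the trait definitions now live in the API crate, such a tokenizer only needs to track `tantivy-tokenizer-api` releases; on the tantivy side it can still be wrapped in a `TextAnalyzer` and registered with the `TokenizerManager` as before.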