move tokenizer API to separate crate
closes #1766

Finding tantivy tokenizers is currently a frustrating experience, since
they need to be updated for each tantivy version. That's unnecessary,
since the API is rather stable anyway.
PSeitz committed Jan 8, 2023
1 parent 4f9efe6 commit c34a9e3
Showing 8 changed files with 268 additions and 240 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -61,6 +61,7 @@ tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -106,7 +107,7 @@ unstable = [] # useful for benches.
quickwit = ["sstable"]

[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar", "tokenizer-api"]

# Following the "fail" crate best practices, we isolate
# tests that define specific behavior in fail check points
10 changes: 7 additions & 3 deletions README.md
@@ -29,7 +29,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
# Features

- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command-line tools
- BM25 scoring (the same as Lucene)
@@ -42,12 +42,12 @@ Your mileage WILL vary depending on the nature of queries and their load.
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, and hierarchical facet fields
- LZ4 compressed document store
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- JSON Field
- Aggregation Collector: range buckets, average, and stats metrics
- Aggregation Collector: histogram, range buckets, average, and stats metrics
- LogMergePolicy with deletes
- Searcher Warmer API
- Cheesy logo with a horse
@@ -81,6 +81,10 @@ There are many ways to support this project.

We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.

## Tokenizer

When implementing a tokenizer for tantivy, depend on `tantivy-tokenizer-api` instead of on tantivy itself.

## Minimum supported Rust version

Tantivy currently requires Rust 1.62 or later to compile.
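To illustrate the new `## Tokenizer` guidance above: with this change, a custom tokenizer can be written against the tokenizer API crate alone. Below is a minimal, hypothetical sketch (the `CommaTokenizer` name and splitting rule are invented for illustration). It assumes the dependency is renamed to `tokenizer_api` as in tantivy's own Cargo.toml above; a crate depending on `tantivy-tokenizer-api` directly would import it as `tantivy_tokenizer_api` by default.

```rust
use tokenizer_api::{BoxTokenStream, Token, TokenStream, Tokenizer};

/// Hypothetical tokenizer that splits text on ASCII commas.
#[derive(Clone)]
pub struct CommaTokenizer;

pub struct CommaTokenStream<'a> {
    text: &'a str,
    offset: usize,
    token: Token,
}

impl Tokenizer for CommaTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(CommaTokenStream {
            text,
            offset: 0,
            token: Token::default(),
        })
    }
}

impl<'a> TokenStream for CommaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        let bytes = self.text.as_bytes();
        // Skip separators; ',' is a single byte, so byte-wise scanning is UTF-8 safe.
        while self.offset < bytes.len() && bytes[self.offset] == b',' {
            self.offset += 1;
        }
        if self.offset == bytes.len() {
            return false;
        }
        let start = self.offset;
        while self.offset < bytes.len() && bytes[self.offset] != b',' {
            self.offset += 1;
        }
        self.token.text.clear();
        self.token.text.push_str(&self.text[start..self.offset]);
        self.token.offset_from = start;
        self.token.offset_to = self.offset;
        // `Token::default()` starts `position` at `usize::MAX`,
        // so the first token gets position 0.
        self.token.position = self.token.position.wrapping_add(1);
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```

Reusing a single `Token` buffer across `advance()` calls mirrors the `Token::default()` design visible in the diff below, where the preallocated `text` buffer avoids a fresh allocation per token.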
10 changes: 7 additions & 3 deletions src/tokenizer/mod.rs
@@ -52,6 +52,8 @@
//! remove their inflection. This tokenizer is slower than the default one,
//! but is recommended to improve recall.
//!
//! # Custom tokenizer library
//! Avoid using tantivy as a dependency; prefer `tantivy-tokenizer-api` instead.
//!
//! # Custom tokenizers
//!
@@ -134,6 +136,10 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;

pub use tokenizer_api::{
BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
};

pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
pub use self::facet_tokenizer::FacetTokenizer;
@@ -146,9 +152,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

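Because these names are now re-exported from `tokenizer_api`, downstream code that imports them through `tantivy::tokenizer` keeps compiling unchanged. A small sketch of typical usage, adapted from the doc examples elsewhere in this diff:

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    // `Token`, `Tokenizer`, `TokenStream`, and friends now live in
    // `tantivy-tokenizer-api`, but the re-export keeps this path stable.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser);
    let mut token_stream = analyzer.token_stream("Hello, happy tax payer");
    while let Some(token) = token_stream.next() {
        println!("Token {:?}", token.text);
    }
}
```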
3 changes: 1 addition & 2 deletions src/tokenizer/ngram_tokenizer.rs
@@ -303,8 +303,7 @@ mod tests {

use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{BoxTokenStream, Token};
use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};

fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
232 changes: 1 addition & 231 deletions src/tokenizer/tokenizer.rs
@@ -1,42 +1,9 @@
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};
use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};

use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// The length of the token, expressed as a number of original tokens.
pub position_length: usize,
}

impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::MAX,
text: String::with_capacity(200),
position_length: 1,
}
}
}

/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
@@ -112,200 +79,3 @@ impl Clone for TextAnalyzer {
}
}
}

/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](crate::tokenizer) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}

pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}

impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}

impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}

fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}

fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}

impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;

fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}

/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);

impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;

fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}

impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}

/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;

/// Returns a reference to the current token.
fn token(&self) -> &Token;

/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;

/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}

/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
while self.advance() {
sink(self.token());
}
}
}

pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}

impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
}

#[cfg(test)]
mod test {
use super::Token;

#[test]
fn clone() {
let t1 = Token {
position: 1,
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1,
};
let t2 = t1.clone();

assert_eq!(t1.position, t2.position);
assert_eq!(t1.offset_from, t2.offset_from);
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
}
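The `TokenFilter` trait removed above moves to the new crate along with the rest of this file. Assuming the signature is unchanged by the move, a filter is implemented by wrapping the upstream `BoxTokenStream` and rewriting tokens as they pass through — a hypothetical sketch (the `Exclaim*` names are invented) that leaves byte offsets untouched, as the `Token` docs require of filters:

```rust
use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream};

/// Hypothetical filter that appends '!' to every token's text.
#[derive(Clone)]
pub struct ExclaimFilter;

impl TokenFilter for ExclaimFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(ExclaimStream { tail: token_stream })
    }
}

pub struct ExclaimStream<'a> {
    tail: BoxTokenStream<'a>,
}

impl<'a> TokenStream for ExclaimStream<'a> {
    fn advance(&mut self) -> bool {
        if !self.tail.advance() {
            return false;
        }
        // Rewrite the text only; offsets must stay as the tokenizer set them.
        self.tail.token_mut().text.push('!');
        true
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
```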
10 changes: 10 additions & 0 deletions tokenizer-api/Cargo.toml
@@ -0,0 +1,10 @@
[package]
name = "tantivy-tokenizer-api"
version = "0.1.0"
edition = "2021"
description = "Tokenizer API of tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde = { version = "1.0.152", features = ["derive"] }
6 changes: 6 additions & 0 deletions tokenizer-api/README.md
@@ -0,0 +1,6 @@

# Tokenizer API

An API to interface a tokenizer with tantivy.

The API will be kept stable in order to not break support for existing tokenizers.