
Commit

Added handling of pre-tokenized text fields (quickwit-oss#642).
kkoziara committed Oct 21, 2019
1 parent f6c525b commit 5a29210
Showing 13 changed files with 406 additions and 17 deletions.
124 changes: 124 additions & 0 deletions examples/pre_tokenized_text.rs
@@ -0,0 +1,124 @@
// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which has already been split into
// tokens by some external tool.
//
// In this example we will:
// - use a tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from JSON,
// - perform a search on documents with pre-tokenized text.

use tantivy::tokenizer::{Token, Tokenizer, TokenStream, SimpleTokenizer, TokenizedString};

use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
use tantivy::query::TermQuery;
use tantivy::collector::{Count, TopDocs};

fn tokenize_it(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    // We add the `TOKENIZED` flag to the `TextOptions` to mark the field as
    // pre-tokenized. In addition, the title will be stored, so we can see it
    // in the returned results.
    schema_builder.add_text_field("title", TEXT | STORED | TOKENIZED);
    schema_builder.add_text_field("body", TEXT | TOKENIZED);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually by setting the fields
    // one by one in a `Document` object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `TokenizedString`, which contains the original text and the
    // vector of tokens.
    let title_tok = TokenizedString {
        text: String::from(title_text),
        tokens: tokenize_it(title_text),
    };

    println!("Original text: \"{}\" and tokens: {:?}", title_tok.text, title_tok.tokens);

    let body_tok = TokenizedString {
        text: String::from(body_text),
        tokens: tokenize_it(body_text),
    };

    // Now let's create a document and add our `TokenizedString`s using the
    // `add_tokenized_text` method of `Document`.
    let mut old_man_doc = Document::default();
    old_man_doc.add_tokenized_text(title, &title_tok);
    old_man_doc.add_tokenized_text(body, &body_tok);

    // ... and now let's just add it to the IndexWriter.
    index_writer.add_document(old_man_doc);

    // A `Document` can also be obtained directly from JSON:
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(&short_man_json)?;

    index_writer.add_document(short_man_doc);

    // Let's commit the changes.
    index_writer.commit()?;

    // ... and now it is time to query our index.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want the documents containing the token "Man"; a `TermQuery` does that.
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

    println!("Document count: {}", count);

    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -285,6 +285,6 @@ mod tests {
payload: None,
};
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false,"tokenized":false}}],"opstamp":0}"#);
}
}
38 changes: 27 additions & 11 deletions src/indexer/segment_writer.rs
@@ -11,6 +11,7 @@ use crate::schema::FieldType
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::tokenizer::{TokenizedString, TokenizedStream};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
@@ -158,26 +159,41 @@ impl SegmentWriter {
}
}
}
FieldType::Str(_) => {
let num_tokens = if let Some(ref mut tokenizer) =
self.tokenizers[field.0 as usize]
{
let texts: Vec<&str> = field_values
FieldType::Str(ref text_options) => {
let num_tokens = if text_options.is_tokenized() {
let tok_strings: Vec<&TokenizedString> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
Value::TokStr(ref tok_str) => Some(tok_str),
_ => None,
})
.collect();
if texts.is_empty() {
if tok_strings.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
let mut token_stream = TokenizedStream::chain_tokenized_strings(&tok_strings[..]);
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
}
} else {
0
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize]
{
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
if texts.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
}
} else {
0
}
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
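The key change in this hunk is that a field marked as tokenized bypasses the registered tokenizer entirely: the supplied tokens are chained together with `TokenizedStream::chain_tokenized_strings` and written to the postings verbatim, while regular text fields keep the old analyzer path. One practical consequence is that no lowercasing or other analysis happens at index time. The sketch below illustrates that end to end; it is a minimal sketch, not part of this commit, and it assumes that `Token`'s fields are public (matching the JSON layout in the example above), uses `Document::add_tokenized_text` added in src/schema/document.rs below, and relies on `Index::create_in_ram`, a standard tantivy constructor not shown in this diff.

use tantivy::collector::Count;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{Token, TokenizedString};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | TOKENIZED);
    let schema = schema_builder.build();

    // In-memory index; `Index::create_in_ram` is a standard tantivy
    // constructor and not part of this diff.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(50_000_000)?;

    // One hand-built token, exactly as an external tool might supply it.
    // The `Token` fields used here follow the JSON layout shown in the example.
    let pre_tokenized = TokenizedString {
        text: "Man".to_string(),
        tokens: vec![Token {
            offset_from: 0,
            offset_to: 3,
            position: 0,
            text: "Man".to_string(),
            position_length: 1,
        }],
    };

    let mut doc = Document::default();
    doc.add_tokenized_text(title, &pre_tokenized);
    index_writer.add_document(doc);
    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let searcher = reader.searcher();

    let exact = TermQuery::new(Term::from_field_text(title, "Man"), IndexRecordOption::Basic);
    let lower = TermQuery::new(Term::from_field_text(title, "man"), IndexRecordOption::Basic);

    // The token was indexed verbatim: no lowercasing was applied, so only the
    // exact-case term matches.
    assert_eq!(searcher.search(&exact, &Count)?, 1);
    assert_eq!(searcher.search(&lower, &Count)?, 0);
    Ok(())
}

Whether that exact-case behavior is desirable is up to the external tokenizer; the point of the branch above is that tantivy no longer interferes with the tokens it is given.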
7 changes: 7 additions & 0 deletions src/schema/document.rs
@@ -1,4 +1,5 @@
use super::*;
use crate::tokenizer::TokenizedString;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::DateTime;
@@ -78,6 +79,12 @@ impl Document {
self.add(FieldValue::new(field, value));
}

/// Add a pre-tokenized text field.
pub fn add_tokenized_text(&mut self, field: Field, tokenized_text: &TokenizedString) {
let value = Value::TokStr(tokenized_text.clone());
self.add(FieldValue::new(field, value));
}

/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add(FieldValue::new(field, Value::U64(value)));
3 changes: 2 additions & 1 deletion src/schema/field_entry.rs
@@ -280,7 +280,8 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false
"stored": false,
"tokenized": false
}
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
30 changes: 29 additions & 1 deletion src/schema/field_type.rs
@@ -1,7 +1,7 @@
use base64::decode;

use crate::schema::{IntOptions, TextOptions};

use crate::tokenizer::TokenizedString;
use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
@@ -169,6 +169,34 @@ impl FieldType {
Err(ValueParsingError::TypeError(msg))
}
},
JsonValue::Object(_) => match *self {
FieldType::Str(ref text_options) => {
if text_options.is_tokenized() {
if let Ok(tok_str_val) = serde_json::from_value::<TokenizedString>(json.clone()) {
Ok(Value::TokStr(tok_str_val))
} else {
let msg = format!(
"Json value {:?} cannot be translated to TokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
}
} else {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
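Because the branch above only accepts a JSON object for fields whose `TextOptions` are marked as tokenized, the same document shape parses for a `TOKENIZED` field but is rejected for a plain `TEXT` field, hitting the `TypeError` arm. A minimal sketch of that behavior (the field names are illustrative, and the flags come from src/schema/text_options.rs below):

use tantivy::schema::*;

fn main() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | TOKENIZED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Object values (the TokenizedString shape) are accepted for the
    // pre-tokenized field...
    let tokenized_json = r#"{"title":[{"text":"Man","tokens":[
        {"offset_from":0,"offset_to":3,"position":0,"text":"Man","position_length":1}]}]}"#;
    assert!(schema.parse_document(tokenized_json).is_ok());

    // ...but rejected for a regular text field, which still expects a string.
    let plain_field_json = r#"{"body":[{"text":"Man","tokens":[]}]}"#;
    assert!(schema.parse_document(plain_field_json).is_err());
}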
1 change: 1 addition & 0 deletions src/schema/mod.rs
@@ -141,6 +141,7 @@ pub use self::text_options::TextFieldIndexing;
pub use self::text_options::TextOptions;
pub use self::text_options::STRING;
pub use self::text_options::TEXT;
pub use self::text_options::TOKENIZED;

pub use self::flags::{FAST, INDEXED, STORED};
pub use self::int_options::Cardinality;
6 changes: 4 additions & 2 deletions src/schema/schema.rs
@@ -439,7 +439,8 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false
"stored": false,
"tokenized": false
}
},
{
@@ -450,7 +451,8 @@
"record": "basic",
"tokenizer": "raw"
},
"stored": false
"stored": false,
"tokenized": false
}
},
{
31 changes: 31 additions & 0 deletions src/schema/text_options.rs
@@ -9,6 +9,7 @@ use std::ops::BitOr;
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
tokenized: bool
}

impl TextOptions {
@@ -33,13 +34,26 @@ impl TextOptions {
self.indexing = Some(indexing);
self
}

/// Returns true if the text is already tokenized, in the form of a `TokenizedString`
pub fn is_tokenized(&self) -> bool {
self.tokenized
}

/// Sets the field as already tokenized
pub fn set_tokenized(mut self) -> TextOptions {
self.tokenized = true;
self
}

}

impl Default for TextOptions {
fn default() -> TextOptions {
TextOptions {
indexing: None,
stored: false,
tokenized: false,
}
}
}
@@ -100,6 +114,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
tokenized: false,
};

/// The field will be tokenized and indexed
@@ -109,6 +124,14 @@ pub const TEXT: TextOptions = TextOptions {
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
tokenized: false,
};

/// The field is already tokenized; values should be supplied as `TokenizedString`
pub const TOKENIZED: TextOptions = TextOptions {
indexing: None,
stored: false,
tokenized: true,
};

impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
Expand All @@ -119,6 +142,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
let mut res = TextOptions::default();
res.indexing = self.indexing.or(other.indexing);
res.stored = self.stored | other.stored;
res.tokenized = self.tokenized | other.tokenized;
res
}
}
@@ -134,6 +158,7 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
tokenized: false,
}
}
}
@@ -158,8 +183,14 @@ mod tests {
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_tokenized());
assert!(field_options.get_indexing_options().is_some());
}
{
let field_options = STORED | TOKENIZED;
assert!(field_options.is_stored());
assert!(field_options.is_tokenized());
}
{
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("body", TEXT);
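As the `BitOr` implementation above shows, the new `tokenized` flag composes with the existing flags, so a schema can combine `TEXT`, `STORED` and `TOKENIZED` freely; `TextOptions::set_tokenized()` is the builder-style equivalent. A small sketch using only the API shown in this diff:

use tantivy::schema::{TextOptions, STORED, TEXT, TOKENIZED};

fn main() {
    // Flag composition: indexing options come from TEXT, storage from STORED,
    // and the pre-tokenized marker from TOKENIZED.
    let by_flags = TEXT | STORED | TOKENIZED;
    assert!(by_flags.is_stored());
    assert!(by_flags.is_tokenized());
    assert!(by_flags.get_indexing_options().is_some());

    // Builder-style equivalent for the tokenized marker alone.
    let by_builder = TextOptions::default().set_tokenized();
    assert!(by_builder.is_tokenized());
    assert!(!by_builder.is_stored());
}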
