
Commit

Added handling of pre-tokenized text fields (quickwit-oss#642).
kkoziara committed Oct 21, 2019
1 parent f6c525b commit 5a29210
Showing 13 changed files with 406 additions and 17 deletions.
124 changes: 124 additions & 0 deletions examples/pre_tokenized_text.rs
@@ -0,0 +1,124 @@
// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which has already been split into
// tokens by some external tool.
//
// In this example we will:
// - use a tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from JSON,
// - perform a search on documents with pre-tokenized text.

use tantivy::tokenizer::{Token, Tokenizer, TokenStream, SimpleTokenizer, TokenizedString};

use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
use tantivy::query::TermQuery;
use tantivy::collector::{Count, TopDocs};

fn tokenize_it(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    // We add the `TOKENIZED` flag to the `TextOptions` to mark the field as
    // pre-tokenized. In addition, the title will be stored, so we can see it
    // in the returned results.
    schema_builder.add_text_field("title", TEXT | STORED | TOKENIZED);
    schema_builder.add_text_field("body", TEXT | TOKENIZED);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually by setting the fields
    // one by one in a `Document` object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `TokenizedString`, which contains the original text and the
    // vector of tokens.
    let title_tok = TokenizedString {
        text: String::from(title_text),
        tokens: tokenize_it(title_text),
    };

    println!("Original text: \"{}\" and tokens: {:?}", title_tok.text, title_tok.tokens);

    let body_tok = TokenizedString {
        text: String::from(body_text),
        tokens: tokenize_it(body_text),
    };

    // Now let's create a document and add our `TokenizedString`s using the
    // `add_tokenized_text` method of `Document`.
    let mut old_man_doc = Document::default();
    old_man_doc.add_tokenized_text(title, &title_tok);
    old_man_doc.add_tokenized_text(body, &body_tok);

    // ... and now let's just add it to the IndexWriter.
    index_writer.add_document(old_man_doc);

    // A `Document` can also be obtained directly from JSON:
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(&short_man_json)?;

    index_writer.add_document(short_man_doc);

    // Let's commit the changes.
    index_writer.commit()?;

    // ... and now it is time to query our index.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want the documents containing the token "Man"; a `TermQuery` does that.
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

    println!("Document count: {}", count);

    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -285,6 +285,6 @@ mod tests {
payload: None,
};
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false,"tokenized":false}}],"opstamp":0}"#);
}
}
38 changes: 27 additions & 11 deletions src/indexer/segment_writer.rs
@@ -11,6 +11,7 @@ use crate::schema::FieldType
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::tokenizer::{TokenizedString, TokenizedStream};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
@@ -158,26 +159,41 @@ impl SegmentWriter {
}
}
}
FieldType::Str(_) => {
let num_tokens = if let Some(ref mut tokenizer) =
self.tokenizers[field.0 as usize]
{
let texts: Vec<&str> = field_values
FieldType::Str(ref text_options) => {
let num_tokens = if text_options.is_tokenized() {
let tok_strings: Vec<&TokenizedString> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
Value::TokStr(ref tok_str) => Some(tok_str),
_ => None,
})
.collect();
if texts.is_empty() {
if tok_strings.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
let mut token_stream = TokenizedStream::chain_tokenized_strings(&tok_strings[..]);
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
}
} else {
0
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize]
{
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
if texts.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
}
} else {
0
}
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
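The key change in this hunk is that a field marked as tokenized bypasses the registered tokenizer entirely: the supplied tokens are chained together with `TokenizedStream::chain_tokenized_strings` and written to the postings verbatim, while regular text fields keep the old analyzer path. One practical consequence is that no lowercasing or other analysis happens at index time. The sketch below illustrates that end to end; it is a minimal sketch, not part of this commit, and it assumes that `Token`'s fields are public (matching the JSON layout in the example above), uses `Document::add_tokenized_text` added in src/schema/document.rs below, and relies on `Index::create_in_ram`, a standard tantivy constructor not shown in this diff.

use tantivy::collector::Count;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{Token, TokenizedString};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | TOKENIZED);
    let schema = schema_builder.build();

    // In-memory index; `Index::create_in_ram` is a standard tantivy
    // constructor and not part of this diff.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(50_000_000)?;

    // One hand-built token, exactly as an external tool might supply it.
    // The `Token` fields used here follow the JSON layout shown in the example.
    let pre_tokenized = TokenizedString {
        text: "Man".to_string(),
        tokens: vec![Token {
            offset_from: 0,
            offset_to: 3,
            position: 0,
            text: "Man".to_string(),
            position_length: 1,
        }],
    };

    let mut doc = Document::default();
    doc.add_tokenized_text(title, &pre_tokenized);
    index_writer.add_document(doc);
    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let searcher = reader.searcher();

    let exact = TermQuery::new(Term::from_field_text(title, "Man"), IndexRecordOption::Basic);
    let lower = TermQuery::new(Term::from_field_text(title, "man"), IndexRecordOption::Basic);

    // The token was indexed verbatim: no lowercasing was applied, so only the
    // exact-case term matches.
    assert_eq!(searcher.search(&exact, &Count)?, 1);
    assert_eq!(searcher.search(&lower, &Count)?, 0);
    Ok(())
}

Whether that exact-case behavior is desirable is up to the external tokenizer; the point of the branch above is that tantivy no longer interferes with the tokens it is given.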
7 changes: 7 additions & 0 deletions src/schema/document.rs
@@ -1,4 +1,5 @@
use super::*;
use crate::tokenizer::TokenizedString;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::DateTime;
@@ -78,6 +79,12 @@ impl Document {
self.add(FieldValue::new(field, value));
}

/// Add a pre-tokenized text field.
pub fn add_tokenized_text(&mut self, field: Field, tokenized_text: &TokenizedString) {
let value = Value::TokStr(tokenized_text.clone());
self.add(FieldValue::new(field, value));
}

/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add(FieldValue::new(field, Value::U64(value)));
3 changes: 2 additions & 1 deletion src/schema/field_entry.rs
@@ -280,7 +280,8 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false
"stored": false,
"tokenized": false
}
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
30 changes: 29 additions & 1 deletion src/schema/field_type.rs
@@ -1,7 +1,7 @@
use base64::decode;

use crate::schema::{IntOptions, TextOptions};

use crate::tokenizer::TokenizedString;
use crate::schema::Facet;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
@@ -169,6 +169,34 @@ impl FieldType {
Err(ValueParsingError::TypeError(msg))
}
},
JsonValue::Object(_) => match *self {
FieldType::Str(ref text_options) => {
if text_options.is_tokenized() {
if let Ok(tok_str_val) = serde_json::from_value::<TokenizedString>(json.clone()) {
Ok(Value::TokStr(tok_str_val))
} else {
let msg = format!(
"Json value {:?} cannot be translated to TokenizedString.",
json
);
Err(ValueParsingError::TypeError(msg))
}
} else {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json, self
);
Err(ValueParsingError::TypeError(msg))
}
}
_ => {
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
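Because the branch above only accepts a JSON object for fields whose `TextOptions` are marked as tokenized, the same document shape parses for a `TOKENIZED` field but is rejected for a plain `TEXT` field, hitting the `TypeError` arm. A minimal sketch of that behavior (the field names are illustrative, and the flags come from src/schema/text_options.rs below):

use tantivy::schema::*;

fn main() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | TOKENIZED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Object values (the TokenizedString shape) are accepted for the
    // pre-tokenized field...
    let tokenized_json = r#"{"title":[{"text":"Man","tokens":[
        {"offset_from":0,"offset_to":3,"position":0,"text":"Man","position_length":1}]}]}"#;
    assert!(schema.parse_document(tokenized_json).is_ok());

    // ...but rejected for a regular text field, which still expects a string.
    let plain_field_json = r#"{"body":[{"text":"Man","tokens":[]}]}"#;
    assert!(schema.parse_document(plain_field_json).is_err());
}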
1 change: 1 addition & 0 deletions src/schema/mod.rs
@@ -141,6 +141,7 @@ pub use self::text_options::TextFieldIndexing;
pub use self::text_options::TextOptions;
pub use self::text_options::STRING;
pub use self::text_options::TEXT;
pub use self::text_options::TOKENIZED;

pub use self::flags::{FAST, INDEXED, STORED};
pub use self::int_options::Cardinality;
6 changes: 4 additions & 2 deletions src/schema/schema.rs
@@ -439,7 +439,8 @@ mod tests {
"record": "position",
"tokenizer": "default"
},
"stored": false
"stored": false,
"tokenized": false
}
},
{
@@ -450,7 +451,8 @@
"record": "basic",
"tokenizer": "raw"
},
"stored": false
"stored": false,
"tokenized": false
}
},
{
31 changes: 31 additions & 0 deletions src/schema/text_options.rs
@@ -9,6 +9,7 @@ use std::ops::BitOr;
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
tokenized: bool
}

impl TextOptions {
@@ -33,13 +34,26 @@ impl TextOptions {
self.indexing = Some(indexing);
self
}

/// Returns true if the text is already tokenized, in the form of a `TokenizedString`
pub fn is_tokenized(&self) -> bool {
self.tokenized
}

/// Sets the field as already tokenized
pub fn set_tokenized(mut self) -> TextOptions {
self.tokenized = true;
self
}

}

impl Default for TextOptions {
fn default() -> TextOptions {
TextOptions {
indexing: None,
stored: false,
tokenized: false,
}
}
}
@@ -100,6 +114,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
tokenized: false,
};

/// The field will be tokenized and indexed
@@ -109,6 +124,14 @@ pub const TEXT: TextOptions = TextOptions {
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
tokenized: false,
};

/// The field is already tokenized; values should be supplied as `TokenizedString`
pub const TOKENIZED: TextOptions = TextOptions {
indexing: None,
stored: false,
tokenized: true,
};

impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
Expand All @@ -119,6 +142,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
let mut res = TextOptions::default();
res.indexing = self.indexing.or(other.indexing);
res.stored = self.stored | other.stored;
res.tokenized = self.tokenized | other.tokenized;
res
}
}
@@ -134,6 +158,7 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
tokenized: false,
}
}
}
@@ -158,8 +183,14 @@ mod tests {
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_tokenized());
assert!(field_options.get_indexing_options().is_some());
}
{
let field_options = STORED | TOKENIZED;
assert!(field_options.is_stored());
assert!(field_options.is_tokenized());
}
{
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("body", TEXT);
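As the `BitOr` implementation above shows, the new `tokenized` flag composes with the existing flags, so a schema can combine `TEXT`, `STORED` and `TOKENIZED` freely; `TextOptions::set_tokenized()` is the builder-style equivalent. A small sketch using only the API shown in this diff:

use tantivy::schema::{TextOptions, STORED, TEXT, TOKENIZED};

fn main() {
    // Flag composition: indexing options come from TEXT, storage from STORED,
    // and the pre-tokenized marker from TOKENIZED.
    let by_flags = TEXT | STORED | TOKENIZED;
    assert!(by_flags.is_stored());
    assert!(by_flags.is_tokenized());
    assert!(by_flags.get_indexing_options().is_some());

    // Builder-style equivalent for the tokenized marker alone.
    let by_builder = TextOptions::default().set_tokenized();
    assert!(by_builder.is_tokenized());
    assert!(!by_builder.is_stored());
}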
