Skip to content

Commit

Permalink
tokenizer-api: reduce Tokenizer overhead
Browse files Browse the repository at this point in the history
Previously a new `Token` for each text encountered was created, which
contains `String::with_capacity(200)`
In the new API the token_stream gets mutable access to the tokenizer,
this allows state to be shared (in this PR Token is shared).
Ideally the allocation for the BoxTokenStream would also be removed, but
this may require some lifetime tricks.
  • Loading branch information
PSeitz committed May 31, 2023
1 parent 3af4569 commit 8a59b38
Show file tree
Hide file tree
Showing 28 changed files with 261 additions and 197 deletions.
3 changes: 2 additions & 1 deletion examples/pre_tokenized_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokenizer = SimpleTokenizer::default();
let mut token_stream = tokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
Expand Down
2 changes: 1 addition & 1 deletion examples/stop_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {

// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
Expand Down
8 changes: 4 additions & 4 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ impl IndexingPositionsPerPath {
pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
Expand All @@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>(
fn index_json_object(
doc: DocId,
json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
Expand All @@ -117,7 +117,7 @@ fn index_json_object(
fn index_json_value(
doc: DocId,
json_value: &serde_json::Value,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
Expand Down Expand Up @@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
pub(crate) fn set_string_and_get_terms(
json_term_writer: &mut JsonTermWriter,
value: &str,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
Expand Down
2 changes: 1 addition & 1 deletion src/fastfield/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ mod tests {
let ff_tokenizer_manager = TokenizerManager::default();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer)
TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.build(),
);
Expand Down
12 changes: 6 additions & 6 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ impl FastFieldsWriter {
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize]
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
Expand Down Expand Up @@ -202,7 +202,7 @@ impl FastFieldsWriter {
self.json_path_buffer.push_str(field_name);

let text_analyzer =
&self.per_field_tokenizer[field_value.field().field_id() as usize];
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];

record_json_obj_to_columnar_writer(
doc_id,
Expand Down Expand Up @@ -263,7 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
Expand Down Expand Up @@ -302,7 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &Option<TextAnalyzer>,
tokenizer: &mut Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
Expand All @@ -321,7 +321,7 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
if let Some(text_analyzer) = tokenizer {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
Expand Down Expand Up @@ -379,7 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&None,
&mut None,
);
}
let mut buffer = Vec::new();
Expand Down
8 changes: 5 additions & 3 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,11 @@ impl SegmentWriter {

match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
Expand All @@ -208,7 +209,7 @@ impl SegmentWriter {
}
Value::Str(ref text) => {
let text_analyzer =
&self.per_field_text_analyzers[field.field_id() as usize];
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
Expand Down Expand Up @@ -304,7 +305,8 @@ impl SegmentWriter {
}
}
FieldType::JsonObject(json_options) => {
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
Expand Down
4 changes: 2 additions & 2 deletions src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
.register("simple_no_truncation", SimpleTokenizer::default());
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;

Expand Down Expand Up @@ -194,7 +194,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
.register("simple_no_truncation", SimpleTokenizer::default());
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;

Expand Down
44 changes: 24 additions & 20 deletions src/query/more_like_this/more_like_this.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,45 +192,49 @@ impl MoreLikeThis {
})
.collect::<Result<Vec<_>>>()?;
for fake_str in facets {
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
if self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
FacetTokenizer::default()
.token_stream(fake_str)
.process(&mut |token| {
if self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
FieldType::Str(text_options) => {
let mut token_streams: Vec<BoxTokenStream> = vec![];

for value in values {
match value {
Value::PreTokStr(tok_str) => {
token_streams.push(PreTokenizedStream::from(tok_str.clone()).into());
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
Value::Str(ref text) => {
if let Some(tokenizer) = text_options
if let Some(mut tokenizer) = text_options
.get_indexing_options()
.map(|text_indexing_options| {
text_indexing_options.tokenizer().to_string()
})
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
{
token_streams.push(tokenizer.token_stream(text));
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
_ => (),
}
}

for mut token_stream in token_streams {
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
FieldType::U64(_) => {
for value in values {
Expand Down
16 changes: 8 additions & 8 deletions src/query/query_parser/query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ impl QueryParser {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
let text_analyzer =
let mut text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
Expand Down Expand Up @@ -490,7 +490,7 @@ impl QueryParser {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer =
let mut text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
Expand All @@ -503,7 +503,7 @@ impl QueryParser {
field,
phrase,
slop,
&text_analyzer,
&mut text_analyzer,
index_record_option,
)?
.into_iter()
Expand Down Expand Up @@ -774,7 +774,7 @@ fn generate_literals_for_str(
field: Field,
phrase: &str,
slop: u32,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
index_record_option: IndexRecordOption,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let mut terms: Vec<(usize, Term)> = Vec::new();
Expand Down Expand Up @@ -810,7 +810,7 @@ fn generate_literals_for_json_object(
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
let mut text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
Expand All @@ -828,7 +828,7 @@ fn generate_literals_for_json_object(
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {
Expand Down Expand Up @@ -925,7 +925,7 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::builder(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
Expand Down Expand Up @@ -1429,7 +1429,7 @@ mod test {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("customtokenizer", SimpleTokenizer);
.register("customtokenizer", SimpleTokenizer::default());
let query_parser = QueryParser::for_index(&index, vec![title]);
assert_eq!(
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
Expand Down
Loading

0 comments on commit 8a59b38

Please sign in to comment.