expose helpers for json field writer manipulation
saroh committed May 4, 2022
1 parent 7e3c0c5 commit 845d809
Showing 3 changed files with 58 additions and 43 deletions.
41 changes: 40 additions & 1 deletion src/indexer/json_term_writer.rs
@@ -4,7 +4,7 @@ use murmurhash32::murmurhash2;
 use crate::fastfield::FastValue;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
-use crate::schema::Type;
+use crate::schema::{Field, Type};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::{OffsetDateTime, UtcOffset};
 use crate::tokenizer::TextAnalyzer;
@@ -199,12 +199,51 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime {
     }
 }
 
+// helper function to generate a Term from a json fastvalue
+pub(crate) fn generate_term_from_json_writer<T: FastValue>(
+    json_term_writer: &mut JsonTermWriter,
+    value: T,
+) -> Term {
+    json_term_writer.set_fast_value(value);
+    json_term_writer.term().clone()
+}
+
+// helper function to generate a list of terms with their positions from a textual json value
+pub(crate) fn generate_terms_from_json_writer(
+    json_term_writer: &mut JsonTermWriter,
+    value: &str,
+    text_analyzer: &TextAnalyzer,
+) -> Vec<(usize, Term)> {
+    let mut positions_and_terms = Vec::<(usize, Term)>::new();
+    json_term_writer.close_path_and_set_type(Type::Str);
+    let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
+    let mut token_stream = text_analyzer.token_stream(value);
+    token_stream.process(&mut |token| {
+        json_term_writer.term_buffer.truncate(term_num_bytes);
+        json_term_writer
+            .term_buffer
+            .append_bytes(token.text.as_bytes());
+        positions_and_terms.push((token.position, json_term_writer.term().clone()));
+    });
+    positions_and_terms
+}
+
 pub struct JsonTermWriter<'a> {
     term_buffer: &'a mut Term,
     path_stack: Vec<usize>,
 }
 
 impl<'a> JsonTermWriter<'a> {
+    // Prepares for writing terms for a given field
+    pub fn initialize(field: Field, json_path: &str, term_buffer: &'a mut Term) -> Self {
+        term_buffer.set_field(Type::Json, field);
+        let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
+        for segment in json_path.split('.') {
+            json_term_writer.push_path_segment(segment);
+        }
+        json_term_writer
+    }
+
     pub fn wrap(term_buffer: &'a mut Term) -> Self {
         term_buffer.clear_with_type(Type::Json);
         let mut path_stack = Vec::with_capacity(10);
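Taken together, the new helpers replace the manual wrap/push-segment/set-value sequence. A minimal crate-internal sketch of how they compose (the helpers are `pub(crate)`, so this only works inside tantivy; the field, path, and values here are hypothetical):

```rust
use crate::indexer::{
    generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter,
};
use crate::schema::{Field, Term};
use crate::tokenizer::TextAnalyzer;

// Sketch: build the path prefix `attributes.count` once, then derive both a
// fast-value term and tokenized text terms from the same writer.
fn example_terms(field: Field, text_analyzer: &TextAnalyzer) -> (Term, Vec<(usize, Term)>) {
    let mut term = Term::new();
    // initialize() sets the field, wraps the buffer, and pushes each
    // `.`-separated path segment.
    let mut json_term_writer = JsonTermWriter::initialize(field, "attributes.count", &mut term);
    // A fast value yields a single term, closed with the value's type code.
    let fast_value_term = generate_term_from_json_writer(&mut json_term_writer, 3u64);
    // A text value yields one (position, term) pair per token.
    let text_terms =
        generate_terms_from_json_writer(&mut json_term_writer, "bright red", text_analyzer);
    (fast_value_term, text_terms)
}
```

This is exactly the sequence the query parser adopts in the third file below.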
4 changes: 3 additions & 1 deletion src/indexer/mod.rs
@@ -25,7 +25,9 @@ use crossbeam::channel;
 use smallvec::SmallVec;
 
 pub use self::index_writer::IndexWriter;
-pub(crate) use self::json_term_writer::JsonTermWriter;
+pub(crate) use self::json_term_writer::{
+    generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter,
+};
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_operation::MergeOperation;
 pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
56 changes: 15 additions & 41 deletions src/query/query_parser/query_parser.rs
@@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp
 
 use super::logical_ast::*;
 use crate::core::Index;
-use crate::indexer::JsonTermWriter;
+use crate::indexer::{
+    generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter,
+};
 use crate::query::{
     AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
     TermQuery,
@@ -660,26 +662,22 @@ fn generate_literals_for_str(
     Ok(Some(LogicalLiteral::Phrase(terms)))
 }
 
-enum NumValue {
-    U64(u64),
-    I64(i64),
-    F64(f64),
-    DateTime(OffsetDateTime),
-}
-
-fn infer_type_num(phrase: &str) -> Option<NumValue> {
+fn infer_fast_value_term(json_term_writer: &mut JsonTermWriter, phrase: &str) -> Option<Term> {
     if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
         let dt_utc = dt.to_offset(UtcOffset::UTC);
-        return Some(NumValue::DateTime(dt_utc));
+        return Some(generate_term_from_json_writer(
+            json_term_writer,
+            DateTime::from_utc(dt_utc),
+        ));
     }
     if let Ok(u64_val) = str::parse::<u64>(phrase) {
-        return Some(NumValue::U64(u64_val));
+        return Some(generate_term_from_json_writer(json_term_writer, u64_val));
     }
     if let Ok(i64_val) = str::parse::<i64>(phrase) {
-        return Some(NumValue::I64(i64_val));
+        return Some(generate_term_from_json_writer(json_term_writer, i64_val));
     }
     if let Ok(f64_val) = str::parse::<f64>(phrase) {
-        return Some(NumValue::F64(f64_val));
+        return Some(generate_term_from_json_writer(json_term_writer, f64_val));
     }
     None
 }
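The rewrite keeps the old inference order — RFC 3339 date first, then `u64`, `i64`, `f64` — but returns a ready-made `Term` instead of an intermediate enum, since `NumValue` existed only to be matched and fed to `set_fast_value`. A standalone sketch of that same cascade (not tantivy API; it uses the `time` crate directly), showing which branch each phrase hits:

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

// Mirrors the cascade in infer_fast_value_term: the first successful parse wins.
fn infer_kind(phrase: &str) -> &'static str {
    if OffsetDateTime::parse(phrase, &Rfc3339).is_ok() {
        "date"
    } else if phrase.parse::<u64>().is_ok() {
        "u64"
    } else if phrase.parse::<i64>().is_ok() {
        "i64"
    } else if phrase.parse::<f64>().is_ok() {
        "f64"
    } else {
        "str" // falls through to text tokenization
    }
}

fn main() {
    assert_eq!(infer_kind("2022-05-04T00:00:00Z"), "date");
    assert_eq!(infer_kind("42"), "u64"); // non-negative integers match u64 first
    assert_eq!(infer_kind("-42"), "i64"); // u64 fails on the sign
    assert_eq!(infer_kind("2.5"), "f64");
    assert_eq!(infer_kind("red"), "str");
}
```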
@@ -695,37 +693,13 @@ fn generate_literals_for_json_object(
     let mut logical_literals = Vec::new();
     let mut term = Term::new();
     term.set_field(Type::Json, field);
-    let mut json_term_writer = JsonTermWriter::wrap(&mut term);
-    for segment in json_path.split('.') {
-        json_term_writer.push_path_segment(segment);
-    }
-    if let Some(num_value) = infer_type_num(phrase) {
-        match num_value {
-            NumValue::U64(u64_val) => {
-                json_term_writer.set_fast_value(u64_val);
-            }
-            NumValue::I64(i64_val) => {
-                json_term_writer.set_fast_value(i64_val);
-            }
-            NumValue::F64(f64_val) => {
-                json_term_writer.set_fast_value(f64_val);
-            }
-            NumValue::DateTime(dt_val) => {
-                json_term_writer.set_fast_value(DateTime::from_utc(dt_val));
-            }
-        }
-        logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
+    let mut json_term_writer = JsonTermWriter::initialize(field, json_path, &mut term);
+    if let Some(term) = infer_fast_value_term(&mut json_term_writer, phrase) {
+        logical_literals.push(LogicalLiteral::Term(term));
     }
     json_term_writer.close_path_and_set_type(Type::Str);
+    let terms = generate_terms_from_json_writer(&mut json_term_writer, phrase, text_analyzer);
     drop(json_term_writer);
-    let term_num_bytes = term.as_slice().len();
-    let mut token_stream = text_analyzer.token_stream(phrase);
-    let mut terms: Vec<(usize, Term)> = Vec::new();
-    token_stream.process(&mut |token| {
-        term.truncate(term_num_bytes);
-        term.append_bytes(token.text.as_bytes());
-        terms.push((token.position, term.clone()));
-    });
     if terms.len() <= 1 {
         for (_, term) in terms {
             logical_literals.push(LogicalLiteral::Term(term));
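Net effect on the query parser: a literal under a JSON path that also parses as a number or date now produces both a fast-value term and the usual tokenized text terms, with the term-buffer plumbing hidden behind the two helpers. A hedged sketch of exercising that path through the public API (the schema, field name, and query string are illustrative, not from this commit):

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut schema_builder = Schema::builder();
    // Hypothetical JSON field named "attributes".
    let json_field = schema_builder.add_json_field("attributes", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let query_parser = QueryParser::for_index(&index, vec![json_field]);
    // "3" is inferred as a u64 fast value *and* tokenized as the string "3",
    // so generate_literals_for_json_object emits a literal for each.
    let query = query_parser.parse_query("attributes.count:3")?;
    println!("{:?}", query);
    Ok(())
}
```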
