Bump tantivy version, and add phrase prefix query support. #3543

Merged
merged 1 commit on Jun 12, 2023
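A minimal usage sketch of the new phrase prefix support (crate import paths are assumptions; the syntax and defaults mirror the test added to quickwit-query/src/query_ast/user_input_query.rs in this PR): a quoted phrase followed by `*` in the query grammar now parses into a phrase prefix query with a default of 50 expansions.

```rust
// Sketch only — the crate paths below are assumptions, not confirmed by this PR.
use quickwit_query::query_ast::{QueryAst, UserInputQuery};
use quickwit_query::BooleanOperand;

fn main() {
    let ast = UserInputQuery {
        user_text: "field:\"hello\"*".to_string(),
        default_fields: None,
        default_operator: BooleanOperand::And,
    }
    .parse_user_query(&[])
    .unwrap();

    // The trailing `*` yields a PhrasePrefixQuery instead of a FullTextQuery,
    // with max_expansions defaulting to 50.
    assert!(matches!(ast, QueryAst::PhrasePrefix(_)));
}
```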
24 changes: 12 additions & 12 deletions quickwit/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
@@ -221,7 +221,7 @@ quickwit-serve = { version = "0.6.0", path = "./quickwit-serve" }
quickwit-storage = { version = "0.6.0", path = "./quickwit-storage" }
quickwit-telemetry = { version = "0.6.0", path = "./quickwit-telemetry" }

- tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "7ee78bd", default-features = false, features = [
+ tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [
"mmap",
"lz4-compression",
"zstd-compression",
2 changes: 1 addition & 1 deletion quickwit/quickwit-doc-mapper/src/doc_mapper.rs
@@ -388,7 +388,7 @@ mod tests {
let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap();
assert_eq!(
format!("{query:?}"),
- r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=U64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
+ r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
);
}

@@ -67,7 +67,7 @@ impl ConvertableToQueryAst for MatchPhrasePrefix {
let phrase_prefix_query_ast = query_ast::PhrasePrefixQuery {
field: self.field,
phrase: query,
- analyzer,
+ params: analyzer,
max_expansions,
};
Ok(phrase_prefix_query_ast.into())
4 changes: 2 additions & 2 deletions quickwit/quickwit-query/src/query_ast/full_text_query.rs
@@ -68,7 +68,7 @@ impl FullTextParams {
let text_indexing_options = json_options
.get_text_indexing_options()
.with_context(|| format!("Json field text `{}` is not indexed", json_path))?;
- let text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
+ let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
let mut term = Term::with_capacity(100);
@@ -91,7 +91,7 @@
text: &str,
text_field_indexing: &TextFieldIndexing,
) -> anyhow::Result<Vec<(usize, Term)>> {
- let text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
+ let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
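The change in this file is only that the analyzer bindings become mutable. A small standalone sketch of the same pattern, built from the tokenizer setup shown in tokenizers.rs below (treating TextAnalyzer::token_stream as requiring a mutable analyzer is an inference from this diff, not something the PR states explicitly):

```rust
use tantivy::tokenizer::{
    BoxTokenStream, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
};

// Collects the tokens of `text` with a "default"-style analyzer. The analyzer
// binding must be `mut` because producing a token stream now needs mutable
// access to the analyzer (hence the `let mut text_analyzer` changes above).
fn collect_tokens(text: &str) -> Vec<String> {
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(255))
        .filter(LowerCaser)
        .build();
    let mut token_stream: BoxTokenStream = analyzer.token_stream(text);
    let mut tokens = Vec::new();
    token_stream.process(&mut |token| tokens.push(token.text.clone()));
    tokens
}
```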
8 changes: 4 additions & 4 deletions quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
@@ -36,7 +36,7 @@ pub struct PhrasePrefixQuery {
pub field: String,
pub phrase: String,
pub max_expansions: u32,
- pub analyzer: FullTextParams,
+ pub params: FullTextParams,
}

impl PhrasePrefixQuery {
@@ -63,7 +63,7 @@ impl PhrasePrefixQuery {
));
}

- let terms = self.analyzer.tokenize_text_into_terms(
+ let terms = self.params.tokenize_text_into_terms(
field,
&self.phrase,
text_field_indexing,
@@ -85,7 +85,7 @@
.to_string(),
));
}
- let terms = self.analyzer.tokenize_text_into_terms_json(
+ let terms = self.params.tokenize_text_into_terms_json(
field,
json_path,
&self.phrase,
@@ -116,7 +116,7 @@ impl BuildTantivyAst for PhrasePrefixQuery {
let (_, terms) = self.get_terms(schema)?;

if terms.is_empty() {
- if self.analyzer.zero_terms_query.is_none() {
+ if self.params.zero_terms_query.is_none() {
Ok(TantivyQueryAst::match_none())
} else {
Ok(TantivyQueryAst::match_all())
31 changes: 31 additions & 0 deletions quickwit/quickwit-query/src/query_ast/user_input_query.rs
@@ -32,6 +32,8 @@ use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
use crate::query_ast::{self, BuildTantivyAst, FullTextMode, FullTextParams, QueryAst};
use crate::{BooleanOperand, InvalidQuery, JsonLiteral};

const DEFAULT_PHRASE_QUERY_MAX_EXPANSION: u32 = 50;

/// A query expressed in the tantivy query grammar DSL.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct UserInputQuery {
@@ -182,6 +184,7 @@ fn convert_user_input_literal(
let UserInputLiteral {
field_name,
phrase,
prefix,
delimiter,
slop,
} = user_input_literal;
@@ -211,6 +214,15 @@
let mut phrase_queries: Vec<QueryAst> = field_names
.into_iter()
.map(|field_name| {
if prefix {
return query_ast::PhrasePrefixQuery {
field: field_name,
phrase: phrase.clone(),
params: full_text_params.clone(),
max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION,
}
.into();
}
query_ast::FullTextQuery {
field: field_name,
text: phrase.clone(),
@@ -309,6 +321,25 @@ mod tests {
);
}

#[test]
fn test_user_input_query_phrase_with_prefix() {
let ast = UserInputQuery {
user_text: "field:\"hello\"*".to_string(),
default_fields: None,
default_operator: BooleanOperand::And,
}
.parse_user_query(&[])
.unwrap();
let QueryAst::PhrasePrefix(phrase_prefix_query) = ast else { panic!() };
assert_eq!(&phrase_prefix_query.field, "field");
assert_eq!(&phrase_prefix_query.phrase, "hello");
assert_eq!(phrase_prefix_query.max_expansions, 50);
assert_eq!(
phrase_prefix_query.params.mode,
FullTextMode::Phrase { slop: 0 }
);
}

#[test]
fn test_user_input_query_override_default_fields() {
let ast = UserInputQuery {
42 changes: 24 additions & 18 deletions quickwit/quickwit-query/src/tokenizers.rs
@@ -26,7 +26,7 @@ use tantivy::tokenizer::{
};

fn create_quickwit_tokenizer_manager() -> TokenizerManager {
- let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+ let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();

@@ -41,14 +41,14 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {

tokenizer_manager.register(
"default",
- TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+ TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.build(),
);
tokenizer_manager.register(
"en_stem",
- TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+ TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.filter(tantivy::tokenizer::Stemmer::new(
@@ -61,11 +61,11 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
}

fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
- let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+ let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();

- let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer)
+ let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(255))
.build();
Expand All @@ -82,7 +82,7 @@ struct ChineseTokenizer;
impl Tokenizer for ChineseTokenizer {
type TokenStream<'a> = ChineseTokenStream<'a>;

- fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+ fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
ChineseTokenStream {
text,
last_char: None,
@@ -209,21 +209,27 @@ mod tests {
sand in my face
"#;

- let tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
- let mut haiku_stream = tokenizer.token_stream(my_haiku);
- assert!(haiku_stream.advance());
- assert!(!haiku_stream.advance());
- let my_too_long_text = vec!["a".repeat(255)].join("");
- assert!(!tokenizer.token_stream(&my_too_long_text).advance());
- let my_long_text = vec!["a".repeat(254)].join("");
- assert!(tokenizer.token_stream(&my_long_text).advance());
+ let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
+ {
+ let mut haiku_stream = tokenizer.token_stream(my_haiku);
+ assert!(haiku_stream.advance());
+ assert!(!haiku_stream.advance());
+ }
+ {
+ let my_too_long_text = vec!["a".repeat(255)].join("");
+ assert!(!tokenizer.token_stream(&my_too_long_text).advance());
+ }
+ {
+ let my_long_text = vec!["a".repeat(254)].join("");
+ assert!(tokenizer.token_stream(&my_long_text).advance());
+ }
}

#[test]
fn test_chinese_tokenizer() {
let text = "Hello world, 你好世界, bonjour monde";

- let tokenizer = get_quickwit_tokenizer_manager()
+ let mut tokenizer = get_quickwit_tokenizer_manager()
.get("chinese_compatible")
.unwrap();
let mut text_stream = tokenizer.token_stream(text);
@@ -300,7 +306,7 @@
fn test_chinese_tokenizer_no_space() {
let text = "Hello你好bonjour";

- let tokenizer = get_quickwit_tokenizer_manager()
+ let mut tokenizer = get_quickwit_tokenizer_manager()
.get("chinese_compatible")
.unwrap();
let mut text_stream = tokenizer.token_stream(text);
@@ -347,8 +353,8 @@
proptest::proptest! {
#[test]
fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") {
- let cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
- let default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
+ let mut cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
+ let mut default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();

let mut text_stream = cn_tok.token_stream(&text);

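The test updates above all follow the same calling pattern: the analyzer fetched from the manager, and the streams it produces, are now bound mutably. A condensed sketch of that pattern (assumes crate-internal access to get_quickwit_tokenizer_manager(), as in these tests):

```rust
// Returns the first token of `text` using the registered "default" analyzer.
// Both bindings need `mut`, matching the updated tests above.
fn first_token(text: &str) -> Option<String> {
    let mut analyzer = get_quickwit_tokenizer_manager().get("default")?;
    let mut stream = analyzer.token_stream(text);
    if stream.advance() {
        Some(stream.token().text.clone())
    } else {
        None
    }
}
```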