Add support for phrase slop in query language (#1393)
Closes #1390
saroh authored Jun 28, 2022
1 parent 8024ecf commit 437cd35
Showing 6 changed files with 107 additions and 28 deletions.
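
Before the per-file diffs, here is a minimal sketch of what the new syntax enables at the query-parser level. The schema, field name, and sample queries are illustrative and not taken from the commit; only the `~N` suffix behaviour is what the change introduces.

use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::tokenizer::TokenizerManager;

fn main() {
    // Illustrative schema with a single text field.
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // A QueryParser can be built from the schema alone; no index is needed just to parse.
    let parser = QueryParser::new(schema, vec![body], TokenizerManager::default());

    // A phrase may now carry a slop: up to one extra position between "big" and "wolf".
    assert!(parser.parse_query("body:\"big wolf\"~1").is_ok());
    // A bare `~` without digits is rejected, as the new grammar tests require.
    assert!(parser.parse_query("body:\"big wolf\"~").is_err());
}
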
2 changes: 1 addition & 1 deletion query-grammar/Cargo.toml
@@ -14,4 +14,4 @@ edition = "2018"
[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
63 changes: 49 additions & 14 deletions query-grammar/src/query_grammar.rs
@@ -16,9 +16,9 @@ use crate::Occur;
// Note: the '-' char is only forbidden at the beginning of a field name; it would be clearer to
// add it to the special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;

/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
@@ -120,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {

fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
phrase.or(word())
negative_number().or(phrase.or(word()))
}

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
let term_val_with_field = negative_number().or(term_val());
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
field_name: Some(field_name),
phrase,
slop,
})
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
let slop =
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
Ok(d) => Ok(d),
_ => Err(StringStreamError::UnexpectedParse),
});
optional(slop).map(|slop| match slop {
Some(d) => d,
_ => 0,
})
}

fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field = term_val().map(|phrase| UserInputLiteral {
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
field_name: None,
phrase,
slop,
});

attempt(term_query())
.or(term_default_field)
.map(UserInputLeaf::from)
@@ -522,18 +536,10 @@ mod test {
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\ field:a"#),
Ok(("my field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"にんじん:a"#),
Ok(("にんじん".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\field:a"#),
Ok((r#"my\field"#.to_string(), "a"))
@@ -562,6 +568,17 @@ mod test {
super::field_name().parse("_my_field:a"),
Ok(("_my_field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("~my~field:a"),
Ok(("~my~field".to_string(), "a"))
);
for special_char in SPECIAL_CHARS.iter() {
let query = &format!("\\{special_char}my\\{special_char}field:a");
assert_eq!(
super::field_name().parse(&query),
Ok((format!("{special_char}my{special_char}field"), "a"))
);
}
}

#[test]
@@ -714,4 +731,22 @@ mod test {
);
test_is_parse_err("abc + ");
}

#[test]
fn test_slop() {
assert!(parse_to_ast().parse("\"a b\"~").is_err());
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());

test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
}
}
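
The heart of the grammar change is `slop_val()`: an optional `~<digits>` suffix that defaults to 0 and rejects values that do not fit in a `u32`. Below is a self-contained sketch of that combinator using the same combine 4 API as the crate; the `main` driver and its sample inputs are mine, not part of the commit.

use combine::error::StringStreamError;
use combine::parser::char::{char, digit};
use combine::{many1, optional, Parser};

/// Optional `~<digits>` suffix; absence means a slop of 0.
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
    let slop = (char('~'), many1(digit())).and_then(|(_, digits): (_, String)| {
        digits
            .parse::<u32>()
            .map_err(|_| StringStreamError::UnexpectedParse)
    });
    optional(slop).map(|slop| slop.unwrap_or(0))
}

fn main() {
    assert_eq!(slop_val().parse("~3"), Ok((3, "")));
    // No suffix at all parses as slop 0 and consumes nothing.
    assert_eq!(slop_val().parse(""), Ok((0, "")));
    // Values that overflow u32 are rejected rather than silently truncated.
    assert!(slop_val().parse("~100000000000000000").is_err());
}

Because the parser has already consumed the `~` and the digits before the overflow check fails, `optional` does not backtrack and the whole query is rejected, which is what the `test_slop` cases above exercise.
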
11 changes: 8 additions & 3 deletions query-grammar/src/user_input_ast.rs
@@ -40,14 +40,19 @@ impl Debug for UserInputLeaf {
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub slop: u32,
}

impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
if let Some(ref field) = self.field_name {
write!(formatter, "\"{}\":", field)?;
}
write!(formatter, "\"{}\"", self.phrase)?;
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
}
Ok(())
}
}

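
The reworked `Debug` impl above composes its output from three independent parts: an optional quoted field prefix, the quoted phrase, and a `~N` suffix printed only for a non-zero slop. A quick sketch of the resulting strings, assuming `UserInputLiteral` is re-exported from the crate root (that import path is my assumption; the diff only shows `user_input_ast.rs`):

// Assumed re-export; adjust the path if the crate exposes the type elsewhere.
use tantivy_query_grammar::UserInputLiteral;

fn main() {
    let with_field = UserInputLiteral {
        field_name: Some("title".to_string()),
        phrase: "big wolf".to_string(),
        slop: 2,
    };
    assert_eq!(format!("{:?}", with_field), r#""title":"big wolf"~2"#);

    // With no field and a slop of 0, only the quoted phrase is printed.
    let bare = UserInputLiteral {
        field_name: None,
        phrase: "big wolf".to_string(),
        slop: 0,
    };
    assert_eq!(format!("{:?}", bare), r#""big wolf""#);
}
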
9 changes: 7 additions & 2 deletions src/query/phrase_query/phrase_query.rs
@@ -43,7 +43,12 @@ impl PhraseQuery {
/// Creates a new `PhraseQuery` given a list of terms and their offsets.
///
/// Can be used to provide custom offset for each term.
pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
pub fn new_with_offset(terms: Vec<(usize, Term)>) -> PhraseQuery {
PhraseQuery::new_with_offset_and_slop(terms, 0)
}

/// Creates a new `PhraseQuery` given a list of terms, their offsets, and a slop.
pub fn new_with_offset_and_slop(mut terms: Vec<(usize, Term)>, slop: u32) -> PhraseQuery {
assert!(
terms.len() > 1,
"A phrase query is required to have strictly more than one term."
@@ -57,7 +62,7 @@ impl PhraseQuery {
PhraseQuery {
field,
phrase_terms: terms,
slop: 0,
slop,
}
}

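
`new_with_offset` is now a thin wrapper over `new_with_offset_and_slop(terms, 0)`, so callers that want a sloppy phrase can construct one directly. A hedged usage sketch follows; the field name and terms are illustrative.

use tantivy::query::PhraseQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    // Any indexed text field works; this one is illustrative.
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    let terms = vec![
        (0, Term::from_field_text(body, "big")),
        (1, Term::from_field_text(body, "wolf")),
    ];
    // Same as new_with_offset(terms), except that one extra position is
    // tolerated between the two terms at search time.
    let query = PhraseQuery::new_with_offset_and_slop(terms, 1);
    println!("{:?}", query);
}
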
11 changes: 9 additions & 2 deletions src/query/query_parser/logical_ast.rs
@@ -8,7 +8,7 @@ use crate::Score;
#[derive(Clone)]
pub enum LogicalLiteral {
Term(Term),
Phrase(Vec<(usize, Term)>),
Phrase(Vec<(usize, Term)>, u32),
Range {
field: Field,
value_type: Type,
@@ -74,7 +74,14 @@ impl fmt::Debug for LogicalLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match *self {
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
LogicalLiteral::Phrase(ref terms, slop) => {
write!(formatter, "\"{:?}\"", terms)?;
if slop > 0 {
write!(formatter, "~{:?}", slop)
} else {
Ok(())
}
}
LogicalLiteral::Range {
ref lower,
ref upper,
39 changes: 33 additions & 6 deletions src/query/query_parser/query_parser.rs
@@ -168,6 +168,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// It is also possible to define a boost for some specific field, at the query parser level.
/// (See [`set_boost(...)`](#method.set_field_boost) ). Typically you may want to boost a title
/// field.
///
/// Phrase terms support the `~` slop operator, which allows setting the phrase's matching
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
#[derive(Clone)]
pub struct QueryParser {
schema: Schema,
@@ -405,6 +408,7 @@ impl QueryParser {
field: Field,
json_path: &str,
phrase: &str,
slop: u32,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -461,6 +465,7 @@ impl QueryParser {
field_name,
field,
phrase,
slop,
&text_analyzer,
index_record_option,
)?
@@ -626,7 +631,9 @@ impl QueryParser {
self.compute_path_triplets_for_literal(&literal)?;
let mut asts: Vec<LogicalAst> = Vec::new();
for (field, json_path, phrase) in term_phrases {
for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? {
for ast in
self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
{
// Apply some field specific boost defined at the query parser level.
let boost = self.field_boost(field);
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
@@ -670,9 +677,9 @@ impl QueryParser {
fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
match logical_literal {
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
LogicalLiteral::Phrase(term_with_offsets) => {
Box::new(PhraseQuery::new_with_offset(term_with_offsets))
}
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
),
LogicalLiteral::Range {
field,
value_type,
Expand All @@ -689,6 +696,7 @@ fn generate_literals_for_str(
field_name: &str,
field: Field,
phrase: &str,
slop: u32,
text_analyzer: &TextAnalyzer,
index_record_option: IndexRecordOption,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
@@ -710,7 +718,7 @@
field_name.to_string(),
));
}
Ok(Some(LogicalLiteral::Phrase(terms)))
Ok(Some(LogicalLiteral::Phrase(terms, slop)))
}

fn generate_literals_for_json_object(
@@ -741,7 +749,7 @@ fn generate_literals_for_json_object(
field_name.to_string(),
));
}
logical_literals.push(LogicalLiteral::Phrase(terms));
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
Ok(logical_literals)
}

@@ -1493,4 +1501,23 @@ mod test {
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}

#[test]
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
"\"a b\"~0",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]" "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]")"#,
false,
);
test_parse_query_to_logical_ast_helper(
"\"a b\"~2",
r#"("[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]"~2 "[(0, Term(type=Str, field=1, "a")), (1, Term(type=Str, field=1, "b"))]"~2)"#,
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b~4\"~2",
r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b")), (2, Term(type=Str, field=0, "4"))]"~2"#,
false,
);
}
}
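
The doc comment added to `QueryParser` claims that `"big wolf"~1` matches documents containing "big bad wolf". A hedged end-to-end check of that claim might look like the sketch below; the heap budget, field name, and sample document are mine, and the writer call deliberately ignores its return value to stay agnostic about the exact `add_document` signature in the reader's tantivy version.

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    let _ = writer.add_document(doc!(body => "the big bad wolf"));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let parser = QueryParser::for_index(&index, vec![body]);

    // The exact phrase misses because of the intervening "bad"...
    let strict = parser.parse_query("\"big wolf\"").expect("valid query");
    assert_eq!(searcher.search(&strict, &Count)?, 0);

    // ...while a slop of 1 tolerates one extra position between the terms.
    let sloppy = parser.parse_query("\"big wolf\"~1").expect("valid query");
    assert_eq!(searcher.search(&sloppy, &Count)?, 1);
    Ok(())
}
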
