Skip to content

Commit

Permalink
implement lenient parser (#2129)
Browse files Browse the repository at this point in the history
* move query parser to nom

* add suupport for term grouping

* initial work on infallible parser

* fmt

* add tests and fix minor parsing bugs

* address review comments

* add support for lenient queries in tantivy

* make lenient parser report errors

* allow mixing occur and bool in query
  • Loading branch information
trinity-1686a authored Aug 8, 2023
1 parent c2be660 commit b92082b
Show file tree
Hide file tree
Showing 8 changed files with 1,662 additions and 509 deletions.
4 changes: 1 addition & 3 deletions query-grammar/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,4 @@ keywords = ["search", "information", "retrieval"]
edition = "2021"

[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
nom = "7"
353 changes: 353 additions & 0 deletions query-grammar/src/infallible.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,353 @@
//! nom combinators for infallible operations
use std::convert::Infallible;

use nom::{AsChar, IResult, InputLength, InputTakeAtPosition};

pub(crate) type ErrorList = Vec<LenientErrorInternal>;
pub(crate) type JResult<I, O> = IResult<I, (O, ErrorList), Infallible>;

/// An error, with an end-of-string based offset
#[derive(Debug)]
pub(crate) struct LenientErrorInternal {
pub pos: usize,
pub message: String,
}

/// A recoverable error and the position it happened at
#[derive(Debug, PartialEq)]
pub struct LenientError {
pub pos: usize,
pub message: String,
}

impl LenientError {
pub(crate) fn from_internal(internal: LenientErrorInternal, str_len: usize) -> LenientError {
LenientError {
pos: str_len - internal.pos,
message: internal.message,
}
}
}

fn unwrap_infallible<T>(res: Result<T, nom::Err<Infallible>>) -> T {
match res {
Ok(val) => val,
Err(_) => unreachable!(),
}
}

// when rfcs#1733 get stabilized, this can make things clearer
// trait InfallibleParser<I, O> = nom::Parser<I, (O, ErrorList), std::convert::Infallible>;

/// A variant of the classical `opt` parser, except it returns an infallible error type.
///
/// It's less generic than the original to ease type resolution in the rest of the code.
pub(crate) fn opt_i<I: Clone, O, F>(mut f: F) -> impl FnMut(I) -> JResult<I, Option<O>>
where F: nom::Parser<I, O, nom::error::Error<I>> {
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => Ok((i, (None, Vec::new()))),
}
}
}

pub(crate) fn opt_i_err<'a, I: Clone + InputLength, O, F>(
mut f: F,
message: impl ToString + 'a,
) -> impl FnMut(I) -> JResult<I, Option<O>> + 'a
where
F: nom::Parser<I, O, nom::error::Error<I>> + 'a,
{
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => {
let errs = vec![LenientErrorInternal {
pos: i.input_len(),
message: message.to_string(),
}];
Ok((i, (None, errs)))
}
}
}
}

pub(crate) fn space0_infallible<T>(input: T) -> JResult<T, T>
where
T: InputTakeAtPosition + Clone,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space0)(input)
.map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
}

pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
where
T: InputTakeAtPosition + Clone + InputLength,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
if spaces.is_none() {
errors.push(LenientErrorInternal {
pos: left.input_len(),
message: "missing space".to_string(),
})
}
(left, (spaces, errors))
})
}

pub(crate) fn fallible<I, O, E: nom::error::ParseError<I>, F>(
mut f: F,
) -> impl FnMut(I) -> IResult<I, O, E>
where F: nom::Parser<I, (O, ErrorList), Infallible> {
use nom::Err;
move |input: I| match f.parse(input) {
Ok((input, (output, _err))) => Ok((input, output)),
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
}
}

pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,
mut third: H,
) -> impl FnMut(I) -> JResult<I, O2>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
H: nom::Parser<I, (O3, ErrorList), Infallible>,
{
move |input: I| {
let (input, (_, mut err)) = first.parse(input)?;
let (input, (o2, mut err2)) = second.parse(input)?;
err.append(&mut err2);
let (input, (_, mut err3)) = third.parse(input)?;
err.append(&mut err3);
Ok((input, (o2, err)))
}
}

// Parse nothing. Just a lazy way to not implement terminated/preceded and use delimited instead
pub(crate) fn nothing(i: &str) -> JResult<&str, ()> {
Ok((i, ((), Vec::new())))
}

pub(crate) trait TupleInfallible<I, O> {
/// Parses the input and returns a tuple of results of each parser.
fn parse(&mut self, input: I) -> JResult<I, O>;
}

impl<Input, Output, F: nom::Parser<Input, (Output, ErrorList), Infallible>>
TupleInfallible<Input, (Output,)> for (F,)
{
fn parse(&mut self, input: Input) -> JResult<Input, (Output,)> {
self.0.parse(input).map(|(i, (o, e))| (i, ((o,), e)))
}
}

// these macros are heavily copied from nom, with some minor adaptations for our type
macro_rules! tuple_trait(
($name1:ident $ty1:ident, $name2: ident $ty2:ident, $($name:ident $ty:ident),*) => (
tuple_trait!(__impl $name1 $ty1, $name2 $ty2; $($name $ty),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident, $($name2:ident $ty2:ident),*) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait!(__impl $($name $ty),+ , $name1 $ty1; $($name2 $ty2),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait_impl!($($name $ty),+, $name1 $ty1);
);
);

macro_rules! tuple_trait_impl(
($($name:ident $ty: ident),+) => (
impl<
Input: Clone, $($ty),+ ,
$($name: nom::Parser<Input, ($ty, ErrorList), Infallible>),+
> TupleInfallible<Input, ( $($ty),+ )> for ( $($name),+ ) {

fn parse(&mut self, input: Input) -> JResult<Input, ( $($ty),+ )> {
let mut error_list = Vec::new();
tuple_trait_inner!(0, self, input, (), error_list, $($name)+)
}
}
);
);

macro_rules! tuple_trait_inner(
($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);

succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);

succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);

Ok((i, (($($parsed)* , o), $error_list)))
});
);

macro_rules! succ (
(0, $submac:ident ! ($($rest:tt)*)) => ($submac!(1, $($rest)*));
(1, $submac:ident ! ($($rest:tt)*)) => ($submac!(2, $($rest)*));
(2, $submac:ident ! ($($rest:tt)*)) => ($submac!(3, $($rest)*));
(3, $submac:ident ! ($($rest:tt)*)) => ($submac!(4, $($rest)*));
(4, $submac:ident ! ($($rest:tt)*)) => ($submac!(5, $($rest)*));
(5, $submac:ident ! ($($rest:tt)*)) => ($submac!(6, $($rest)*));
(6, $submac:ident ! ($($rest:tt)*)) => ($submac!(7, $($rest)*));
(7, $submac:ident ! ($($rest:tt)*)) => ($submac!(8, $($rest)*));
(8, $submac:ident ! ($($rest:tt)*)) => ($submac!(9, $($rest)*));
(9, $submac:ident ! ($($rest:tt)*)) => ($submac!(10, $($rest)*));
(10, $submac:ident ! ($($rest:tt)*)) => ($submac!(11, $($rest)*));
(11, $submac:ident ! ($($rest:tt)*)) => ($submac!(12, $($rest)*));
(12, $submac:ident ! ($($rest:tt)*)) => ($submac!(13, $($rest)*));
(13, $submac:ident ! ($($rest:tt)*)) => ($submac!(14, $($rest)*));
(14, $submac:ident ! ($($rest:tt)*)) => ($submac!(15, $($rest)*));
(15, $submac:ident ! ($($rest:tt)*)) => ($submac!(16, $($rest)*));
(16, $submac:ident ! ($($rest:tt)*)) => ($submac!(17, $($rest)*));
(17, $submac:ident ! ($($rest:tt)*)) => ($submac!(18, $($rest)*));
(18, $submac:ident ! ($($rest:tt)*)) => ($submac!(19, $($rest)*));
(19, $submac:ident ! ($($rest:tt)*)) => ($submac!(20, $($rest)*));
(20, $submac:ident ! ($($rest:tt)*)) => ($submac!(21, $($rest)*));
);

tuple_trait!(FnA A, FnB B, FnC C, FnD D, FnE E, FnF F, FnG G, FnH H, FnI I, FnJ J, FnK K, FnL L,
FnM M, FnN N, FnO O, FnP P, FnQ Q, FnR R, FnS S, FnT T, FnU U);

// Special case: implement `TupleInfallible` for `()`, the unit type.
// This can come up in macros which accept a variable number of arguments.
// Literally, `()` is an empty tuple, so it should simply parse nothing.
impl<I> TupleInfallible<I, ()> for () {
fn parse(&mut self, input: I) -> JResult<I, ()> {
Ok((input, ((), Vec::new())))
}
}

pub(crate) fn tuple_infallible<I, O, List: TupleInfallible<I, O>>(
mut l: List,
) -> impl FnMut(I) -> JResult<I, O> {
move |i: I| l.parse(i)
}

pub(crate) fn separated_list_infallible<I, O, O2, F, G>(
mut sep: G,
mut f: F,
) -> impl FnMut(I) -> JResult<I, Vec<O>>
where
I: Clone + InputLength,
F: nom::Parser<I, (O, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |i: I| {
let mut res: Vec<O> = Vec::new();
let mut errors: ErrorList = Vec::new();

let (mut i, (o, mut err)) = unwrap_infallible(f.parse(i.clone()));
errors.append(&mut err);
res.push(o);

loop {
let (i_sep_parsed, (_, mut err_sep)) = unwrap_infallible(sep.parse(i.clone()));
let len_before = i_sep_parsed.input_len();

let (i_elem_parsed, (o, mut err_elem)) =
unwrap_infallible(f.parse(i_sep_parsed.clone()));

// infinite loop check: the parser must always consume
// if we consumed nothing here, don't produce an element.
if i_elem_parsed.input_len() == len_before {
return Ok((i, (res, errors)));
}
res.push(o);
errors.append(&mut err_sep);
errors.append(&mut err_elem);
i = i_elem_parsed;
}
}
}

pub(crate) trait Alt<I, O> {
/// Tests each parser in the tuple and returns the result of the first one that succeeds
fn choice(&mut self, input: I) -> Option<JResult<I, O>>;
}

macro_rules! alt_trait(
($first_cond:ident $first:ident, $($id_cond:ident $id: ident),+) => (
alt_trait!(__impl $first_cond $first; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
alt_trait_impl!($($current_cond $current),*);

alt_trait!(__impl $($current_cond $current,)* $head_cond $head; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident) => (
alt_trait_impl!($($current_cond $current),*);
alt_trait_impl!($($current_cond $current,)* $head_cond $head);
);
);

macro_rules! alt_trait_impl(
($($id_cond:ident $id:ident),+) => (
impl<
Input: Clone, Output,
$(
// () are to make things easier on me, but I'm not entirely sure whether we can do better
// with rule E0207
$id_cond: nom::Parser<Input, (), ()>,
$id: nom::Parser<Input, (Output, ErrorList), Infallible>
),+
> Alt<Input, Output> for ( $(($id_cond, $id),)+ ) {

fn choice(&mut self, input: Input) -> Option<JResult<Input, Output>> {
match self.0.0.parse(input.clone()) {
Err(_) => alt_trait_inner!(1, self, input, $($id_cond $id),+),
Ok((input_left, _)) => Some(self.0.1.parse(input_left)),
}
}
}
);
);

macro_rules! alt_trait_inner(
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
match $self.$it.0.parse($input.clone()) {
Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
}
);
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
None
);
);

alt_trait!(A1 A, B1 B, C1 C, D1 D, E1 E, F1 F, G1 G, H1 H, I1 I, J1 J, K1 K,
L1 L, M1 M, N1 N, O1 O, P1 P, Q1 Q, R1 R, S1 S, T1 T, U1 U);

/// An alt() like combinator. For each branch, it first tries a fallible parser, which commits to
/// this branch, or tells to check next branch, and the execute the infallible parser which follow.
///
/// In case no branch match, the default (fallible) parser is executed.
pub(crate) fn alt_infallible<I: Clone, O, F, List: Alt<I, O>>(
mut l: List,
mut default: F,
) -> impl FnMut(I) -> JResult<I, O>
where
F: nom::Parser<I, (O, ErrorList), Infallible>,
{
move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i))
}
13 changes: 10 additions & 3 deletions query-grammar/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
#![allow(clippy::derive_partial_eq_without_eq)]

mod infallible;
mod occur;
mod query_grammar;
mod user_input_ast;
use combine::parser::Parser;

pub use crate::infallible::LenientError;
pub use crate::occur::Occur;
use crate::query_grammar::parse_to_ast;
use crate::query_grammar::{parse_to_ast, parse_to_ast_lenient};
pub use crate::user_input_ast::{
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
};

pub struct Error;

/// Parse a query
pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
let (_remaining, user_input_ast) = parse_to_ast(query).map_err(|_| Error)?;
Ok(user_input_ast)
}

/// Parse a query, trying to recover from syntax errors, and giving hints toward fixing errors.
pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
parse_to_ast_lenient(query)
}
Loading

0 comments on commit b92082b

Please sign in to comment.