Skip to content

Commit

Permalink
Adds the scanner rules for punctuation (#30)
Browse files Browse the repository at this point in the history
Adds the Pest grammar rules for these tokens and particularly
adds the lookahead assertions for the various edge cases around the
punctuation tokens that interact with each other, comments, and decimals.

Adds special `Content` enum variants for `.`/`*`/`?` and a basic variant
for `Operator` and `Delimiter`.

This should be the final pre-requisite for all of the terminal parse rules in
the PEG and allow us to start adding the parser rules for expressions.

An explicit TODO is around modeling the various operators as their own
enum or enum variants, right now they are returned as normalized string
content.
  • Loading branch information
almann authored Jun 3, 2021
1 parent d309746 commit d257e3e
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 7 deletions.
86 changes: 84 additions & 2 deletions partiql-parser/src/partiql.pest
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,88 @@ Token = _{
| String
| Number
| Identifier
| Punctuation
}

//
// Punctuation
//

// Silent (`_`) grouping rule covering every punctuation token. Being silent, it
// produces no pair of its own; the scanner dispatches on the rule of whichever
// alternative matched (`Dot_`, `Star_`, `Parameter`, `Operator`, `Delimiter`).
//
// Alternatives are tried in order (PEG ordered choice). `Operator` is attempted
// before `Delimiter`, so the single-character `<` and `>` operators depend on
// their negative lookaheads to leave `<<` and `>>` for `Delimiter`.
Punctuation = _{
Dot_
| Star_
| Parameter
| Operator
| Delimiter
}

// `.` is the pathing operator. It is kept out of `Operator` because it has
// special meaning beyond a normal operator (e.g. wildcard paths); the scanner
// reports it as `Content::Dot`.
Dot_ = { "." }

// `*` is both the multiplication operator and the wildcard, so it is also kept
// out of `Operator`; the scanner reports it as `Content::Star`.
Star_ = { "*" }

// `?` placeholder for a query parameter. `Parameter` wraps the terminal in an
// atomic (`@`) rule, and the scanner dispatches on `Rule::Parameter`.
Parameter = @{ QuestionMark_ }
QuestionMark_ = { "?" }

// Punctuation that are operators--does not count keyword operators, nor `.` and
// `*`, which have dedicated rules (`Dot_`, `Star_`).
//
// The rule is atomic (`@`), so the scanner dispatches on `Rule::Operator` and
// works from the matched text rather than from the individual terminal rules.
//
// `Minus_`, `Divide_`, `Less_` and `Greater_` carry negative lookaheads, which is
// what makes it safe to list `Less_`/`Greater_` ahead of the longer `<=`/`>=`
// alternatives: they simply fail when a longer token is present.
Operator = @{
LexicalScope_
| Plus_
| Minus_
| Divide_
| Modulus_
| Less_
| LessEq_
| Greater_
| GreaterEq_
| Eq_
| NotEq_
| Concat_
}

// Terminal rules for the punctuation operators. The ones with a negative
// lookahead (`!`) only match when the following character would not form a
// different, longer token.
LexicalScope_ = { "@" }
Plus_ = { "+" }
// `-` not followed by another `-`, so a run of dashes is never scanned as minus
// operators (the scanner tests treat `--...` as a line comment).
Minus_ = @{ "-" ~ !"-" }
// `/` not followed by `*`, so `/*` is left to open a block comment (the scanner
// tests treat `/**/` as a comment between divide operators).
Divide_ = @{ "/" ~ !"*" }
Modulus_ = { "%" }
// `<` on its own only: `<<` is a delimiter, `<=` and `<>` are other operators.
Less_ = @{ "<" ~ !("<" | "=" | ">") }
LessEq_ = { "<=" }
// `>` on its own only: `>>` is a delimiter and `>=` is another operator.
Greater_ = @{ ">" ~ !(">" | "=") }
GreaterEq_ = { ">=" }
Eq_ = { "=" }
// Both spellings of not-equal are accepted; the scanner normalizes `!=` to `<>`.
NotEq_ = { ("<>" | "!=") }
Concat_ = { "||" }

// Punctuation that delimit things in the grammar.
//
// The rule is atomic (`@`), so the scanner dispatches on `Rule::Delimiter` and
// uses the matched text as the delimiter content. `<<` and `>>` are reachable
// here because `Less_` and `Greater_` refuse to match a doubled angle bracket.
Delimiter = @ {
Comma_
| Colon_
| SemiColon_
| LeftParen_
| RightParen_
| LeftBracket_
| RightBracket_
| LeftCurly_
| RightCurly_
| LeftDoubleAngle_
| RightDoubleAngle_
}

// Terminal rules for the delimiters. All are plain literals; the only
// multi-character ones are the doubled angle brackets at the end.
Comma_ = { "," }
Colon_ = { ":" }
SemiColon_ = { ";" }
LeftParen_ = { "(" }
RightParen_ = { ")" }
LeftBracket_ = { "[" }
RightBracket_ = { "]" }
LeftCurly_ = { "{" }
RightCurly_ = { "}" }
LeftDoubleAngle_ = { "<<" }
RightDoubleAngle_ = { ">>" }

//
// Numeric Literals
//
Expand All @@ -37,12 +117,14 @@ DecimalExp = {
Decimal ~ ("e" | "E") ~ Integer
}

// XXX `.nnn` and `nnn.` are okay, but `.` on its own definitely is not
Decimal = {
Integer? ~ "." ~ Fraction
Integer? ~ "." ~ Fraction
| Integer ~ "."
}

Fraction = {
Digit*
Digit+
}

// XXX this explicitly supports arbitrary zero prefixing in various places
Expand Down
129 changes: 124 additions & 5 deletions partiql-parser/src/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ use pest::iterators::Pair;
use pest::{Parser, RuleType};
use std::borrow::Cow;

// TODO turn operator/delimiter into enums of their own (nested or otherwise)

/// The parsed content associated with a [`Token`] that has been scanned.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Content<'val> {
Expand All @@ -36,7 +38,22 @@ pub enum Content<'val> {

/// A string literal. Contains the slice for the content of the literal.
StringLiteral(Cow<'val, str>),
// TODO things like literals, punctuation, etc.

/// The `.` punctuation
Dot,

/// The `*` operator and wildcard.
Star,

/// The `?` placeholder for a query parameter.
Parameter,

/// An operator represented by punctuation (as opposed to a keyword based operator).
/// Contains the slice for the operator.
Operator(Cow<'val, str>),

/// A separator character. Contains the slice for the delimiter character.
Delimiter(Cow<'val, str>),
}

/// Convenience constructor for a [`Content::Keyword`].
Expand Down Expand Up @@ -64,6 +81,16 @@ pub fn string_literal<'val, S: Into<Cow<'val, str>>>(text: S) -> Content<'val> {
Content::StringLiteral(text.into())
}

/// Builds a [`Content::Operator`] from anything convertible into a `Cow<'val, str>`.
///
/// The text is stored as given; no normalization is applied here.
pub fn operator<'val, S: Into<Cow<'val, str>>>(text: S) -> Content<'val> {
    let slice: Cow<'val, str> = text.into();
    Content::Operator(slice)
}

/// Convenience constructor for a [`Content::Delimiter`].
///
/// Accepts anything convertible into a `Cow<'val, str>` and stores it as the
/// delimiter text without modification.
pub fn delimiter<'val, S: Into<Cow<'val, str>>>(text: S) -> Content<'val> {
    Content::Delimiter(text.into())
}

/// Internal type to keep track of remaining input and relative line/column information.
///
/// This is used to leverage the PEG to do continuation parsing and calculating the line/offset
Expand Down Expand Up @@ -174,6 +201,14 @@ where
}
}

/// Maps the raw text of a punctuation operator to its canonical spelling.
///
/// The only alias handled is `!=`, which is rewritten to `<>`. Any other text is
/// returned unchanged. The result always borrows (either from the input or from
/// a static literal), so no allocation takes place.
fn normalize_operator(raw_text: &str) -> Cow<str> {
    let canonical = if raw_text == "!=" { "<>" } else { raw_text };
    Cow::Borrowed(canonical)
}

impl<'val> PartiQLScanner<'val> {
fn do_next_token(&mut self) -> ParserResult<Token<'val>> {
// the scanner rule is expected to return a single node
Expand All @@ -186,14 +221,14 @@ impl<'val> PartiQLScanner<'val> {
self.remainder = self.remainder.consume(start_off + text.len(), pair.end()?);

let content = match pair.as_rule() {
Rule::Keyword => Content::Keyword(text.to_uppercase().into()),
Rule::String => Content::StringLiteral(normalize_string_lit(pair.as_str())),
Rule::Keyword => keyword(text.to_uppercase()),
Rule::String => string_literal(normalize_string_lit(pair.as_str())),
Rule::Identifier => {
let ident_pair = pair.into_inner().exactly_one()?;
match ident_pair.as_rule() {
Rule::NonQuotedIdentifier => Content::Identifier(ident_pair.as_str().into()),
Rule::NonQuotedIdentifier => identifier(ident_pair.as_str()),
Rule::QuotedIdentifier => {
Content::Identifier(normalize_quoted_ident(ident_pair.as_str()))
identifier(normalize_quoted_ident(ident_pair.as_str()))
}
_ => return ident_pair.unexpected(),
}
Expand All @@ -208,6 +243,11 @@ impl<'val> PartiQLScanner<'val> {
_ => return number_pair.unexpected(),
}
}
Rule::Dot_ => Content::Dot,
Rule::Star_ => Content::Star,
Rule::Parameter => Content::Parameter,
Rule::Operator => operator(normalize_operator(text)),
Rule::Delimiter => delimiter(text),
_ => return pair.unexpected(),
};

Expand Down Expand Up @@ -533,6 +573,85 @@ mod test {
"0.0e000" => decimal_literal_from_str("0.0")
]
)]
#[case::no_trailing_zeros(scanner_test_case!["1231231." => decimal_literal_from_str("1231231")])]
#[case::delimiters(
scanner_test_case![
"[" => delimiter("["),
"]" => delimiter("]"),
"(" => delimiter("("),
")" => delimiter(")"),
"{" => delimiter("{"),
"}" => delimiter("}"),
"<<" => delimiter("<<"),
">>" => delimiter(">>"),
"," => delimiter(","),
":" => delimiter(":"),
";" => delimiter(";"),
]
)]
#[case::operators(
scanner_test_case![
"@" => operator("@"),
"+" => operator("+"),
"-" => operator("-"),
"/" => operator("/"),
"%" => operator("%"),
"<" => operator("<"),
" ",
"<=" => operator("<="),
">" => operator(">"),
" ",
">=" => operator(">="),
"=" => operator("="),
"<>" => operator("<>"),
"!=" => operator("<>"),
]
)]
#[case::left_angles(
scanner_test_case![
"<<" => delimiter("<<"),
"<<" => delimiter("<<"),
"<" => operator("<"),
]
)]
#[case::right_angles(
scanner_test_case![
">>" => delimiter(">>"),
">>" => delimiter(">>"),
">" => operator(">"),
]
)]
#[case::balanced_angles(
scanner_test_case![
"<<" => delimiter("<<"),
"<<" => delimiter("<<"),
"<>" => operator("<>"),
">>" => delimiter(">>"),
">>" => delimiter(">>"),
" ",
"<<" => delimiter("<<"),
"<=" => operator("<="),
">>" => delimiter(">>"),
">" => operator(">"),
]
)]
#[case::dot(scanner_test_case!["." => Content::Dot])]
#[case::star(scanner_test_case!["*" => Content::Star])]
#[case::parameter(scanner_test_case!["?" => Content::Parameter])]
#[case::comment_no_minus(
scanner_test_case![
"-------- a line comment with no minus...\n"
]
)]
#[case::divide_block_comment(
scanner_test_case![
"/" => operator("/"),
"/" => operator("/"),
"/**/",
"/" => operator("/"),
"/" => operator("/"),
]
)]
#[case::select_from(
scanner_test_case![
"SelEct" => keyword("SELECT"),
Expand Down

0 comments on commit d257e3e

Please sign in to comment.