From 0aa29ad27f0847b9c32ec3a36653577ca6dbd70a Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Thu, 20 Jun 2024 16:48:46 +0200 Subject: [PATCH] refactor: Use DSL v2 types in place of ScannerDefinitionNode (#1003) Based on #1002 Part of #638 The DSL v2's `model::Scanner` is almost 1:1 usable with our current parser codegen, with the exception of versioned scanners. In short we: - introduce a helper type `VersionedScanner` that is used in place of the old `ScannerDefinitionNode::Versioned` - introduce a `ScannerCodegen` trait implemented for both `VersionedScanner` and the `model::Scanner`, which is responsible for generating the main scanning logic - repurpose `ScannerDefinition` slightly to surface more of the scanner-related logic for trivia/fragment/token - implement this directly for `model::{TriviaItem,FragmentItem,TokenItem}` and store it in the v1 Grammar struct - similarly, use `model::KeywordItem` directly in place of v1's `KeywordScannerDefinition` as they share the same functionality --- .../definition/src/model/terminals/keyword.rs | 19 +- .../runtime/generator/src/parser/codegen.rs | 3 +- .../codegen/keyword_scanner_definition.rs | 62 ++++- .../src/parser/codegen/parser_definition.rs | 2 +- .../src/parser/codegen/scanner_definition.rs | 246 ++++++++++++------ .../generator/src/parser/codegen/trie.rs | 13 +- .../generator/src/parser/codegen/versioned.rs | 18 +- .../runtime/generator/src/parser/grammar.rs | 7 +- .../src/parser/grammar/constructor.rs | 143 +--------- .../src/parser/grammar/parser_definition.rs | 6 +- .../src/parser/grammar/scanner_definition.rs | 143 +--------- .../generator/src/parser/grammar/visitor.rs | 7 +- .../runtime/generator/src/parser/mod.rs | 23 +- 13 files changed, 290 insertions(+), 402 deletions(-) diff --git a/crates/codegen/language/definition/src/model/terminals/keyword.rs b/crates/codegen/language/definition/src/model/terminals/keyword.rs index e8439b1142..eeff982fc1 100644 --- 
a/crates/codegen/language/definition/src/model/terminals/keyword.rs +++ b/crates/codegen/language/definition/src/model/terminals/keyword.rs @@ -2,7 +2,7 @@ use codegen_language_internal_macros::{derive_spanned_type, ParseInputTokens, Wr use itertools::Itertools; use serde::{Deserialize, Serialize}; -use crate::model::{Identifier, VersionSpecifier}; +use crate::model::{Identifier, Scanner, VersionSpecifier}; #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] #[derive_spanned_type(Clone, Debug, ParseInputTokens, WriteOutputTokens)] @@ -32,6 +32,23 @@ pub enum KeywordValue { Atom { atom: String }, } +impl From for Scanner { + fn from(value: KeywordValue) -> Scanner { + match value { + KeywordValue::Optional { value } => Scanner::Optional { + scanner: Box::new((*value).into()), + }, + KeywordValue::Sequence { values } => Scanner::Sequence { + scanners: values.into_iter().map(Into::into).collect(), + }, + KeywordValue::Atom { atom } => Scanner::Atom { atom }, + KeywordValue::Choice { values } => Scanner::Choice { + scanners: values.into_iter().map(Into::into).collect(), + }, + } + } +} + impl KeywordValue { /// Collects all possible variations generated by this value. 
pub fn collect_variations(&self) -> Vec { diff --git a/crates/codegen/runtime/generator/src/parser/codegen.rs b/crates/codegen/runtime/generator/src/parser/codegen.rs index 5f4fdfcd34..a3d4597eb9 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen.rs @@ -5,8 +5,7 @@ mod scanner_definition; mod trie; mod versioned; -pub use keyword_scanner_definition::KeywordScannerDefinitionCodegen; +pub use keyword_scanner_definition::{KeywordItemAtom, KeywordScannerDefinitionCodegen}; pub use parser_definition::ParserDefinitionCodegen; pub use precedence_parser_definition::PrecedenceParserDefinitionCodegen; -pub use scanner_definition::ScannerDefinitionCodegen; pub use trie::Trie; diff --git a/crates/codegen/runtime/generator/src/parser/codegen/keyword_scanner_definition.rs b/crates/codegen/runtime/generator/src/parser/codegen/keyword_scanner_definition.rs index 124a43f186..2b287427e8 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen/keyword_scanner_definition.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen/keyword_scanner_definition.rs @@ -1,22 +1,23 @@ +use std::rc::Rc; + use codegen_language_definition::model; use proc_macro2::TokenStream; use quote::{format_ident, quote}; -use crate::parser::codegen::scanner_definition::ScannerDefinitionNodeCodegen as _; +use crate::parser::codegen::scanner_definition::ScannerCodegen as _; use crate::parser::codegen::versioned::VersionedQuote; -use crate::parser::grammar::{KeywordScannerDefinitionRef, ScannerDefinitionNode}; pub trait KeywordScannerDefinitionCodegen { fn to_scanner_code(&self) -> TokenStream; } -impl KeywordScannerDefinitionCodegen for KeywordScannerDefinitionRef { +impl KeywordScannerDefinitionCodegen for model::KeywordItem { fn to_scanner_code(&self) -> TokenStream { - let name_ident = format_ident!("{}", self.name()); + let name_ident = format_ident!("{}", self.name); let terminal_kind = quote! 
{ TerminalKind::#name_ident }; let kw_scanners: Vec<_> = self - .definitions() + .definitions .iter() .map(|versioned_kw| { let scanner = versioned_kw.value.to_scanner_code(); @@ -82,6 +83,55 @@ impl KeywordScannerDefinitionCodegen for KeywordScannerDefinitionRef { impl KeywordScannerDefinitionCodegen for model::KeywordValue { fn to_scanner_code(&self) -> TokenStream { // This is a subset; let's reuse that - ScannerDefinitionNode::from(self.clone()).to_scanner_code() + model::Scanner::from(self.clone()).to_scanner_code() + } +} + +/// A newtype wrapper around [`model::KeywordItem`] that only has a single atom value. +/// +/// The main usage for this type is to construct a keyword trie, as trie will +/// only work with single atom values and keyword promotion needs to additionally account for +/// keyword reservation, rather than just literal presence. +#[derive(Clone)] +pub struct KeywordItemAtom(Rc); + +impl KeywordItemAtom { + /// Wraps the keyword scanner definition if it is a single atom value. + pub fn try_from_def(def: &Rc) -> Option { + match def.definitions[..] { + [model::KeywordDefinition { + value: model::KeywordValue::Atom { .. }, + .. + }] => Some(Self(Rc::clone(def))), + _ => None, + } + } +} + +impl std::ops::Deref for KeywordItemAtom { + type Target = Rc; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl KeywordItemAtom { + pub fn definition(&self) -> &model::KeywordDefinition { + self.0 + .definitions + .first() + .expect("KeywordItemAtom should have exactly one definition") + } + + /// The single atom value that this keyword item matches. + pub fn value(&self) -> &str { + match self.definition() { + model::KeywordDefinition { + value: model::KeywordValue::Atom { atom }, + .. 
+ } => atom, + _ => unreachable!("KeywordItemAtom should have a single atom value"), + } } } diff --git a/crates/codegen/runtime/generator/src/parser/codegen/parser_definition.rs b/crates/codegen/runtime/generator/src/parser/codegen/parser_definition.rs index 438680fc6c..7bd9ab753a 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen/parser_definition.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen/parser_definition.rs @@ -144,7 +144,7 @@ impl ParserDefinitionNodeCodegen for ParserDefinitionNode { // Keyword scanner uses the promotion inside the parse_terminal Self::KeywordScannerDefinition(scanner_definition) => { - let kind = format_ident!("{name}", name = scanner_definition.name()); + let kind = format_ident!("{name}", name = scanner_definition.name); let parse_terminal = if is_trivia { format_ident!("parse_terminal") diff --git a/crates/codegen/runtime/generator/src/parser/codegen/scanner_definition.rs b/crates/codegen/runtime/generator/src/parser/codegen/scanner_definition.rs index d34fafb89a..d04ab41163 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen/scanner_definition.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen/scanner_definition.rs @@ -1,132 +1,216 @@ use std::collections::BTreeSet; +use codegen_language_definition::model::{self, Identifier}; use inflector::Inflector; use proc_macro2::TokenStream; use quote::{format_ident, quote}; use crate::parser::codegen::versioned::VersionedQuote; -use crate::parser::grammar::{ScannerDefinitionNode, ScannerDefinitionRef}; +use crate::parser::grammar::ScannerDefinition; -pub trait ScannerDefinitionCodegen { - fn to_scanner_code(&self) -> TokenStream; - fn literals(&self) -> Vec; +impl ScannerDefinition for model::TriviaItem { + fn name(&self) -> &Identifier { + &self.name + } + + fn to_scanner_code(&self) -> proc_macro2::TokenStream { + self.scanner.to_scanner_code() + } + + fn literals(&self) -> Option> { + self.scanner.literals() + } } -impl 
ScannerDefinitionCodegen for ScannerDefinitionRef { - fn to_scanner_code(&self) -> TokenStream { - self.node().to_scanner_code() +impl ScannerDefinition for model::FragmentItem { + fn name(&self) -> &Identifier { + &self.name } - fn literals(&self) -> Vec { - let mut result = BTreeSet::new(); - if self.node().literals(&mut result) { - result.into_iter().collect() - } else { - vec![] + + fn to_scanner_code(&self) -> proc_macro2::TokenStream { + VersionedScanner::new(&self.scanner, self.enabled.as_ref()).to_scanner_code() + } + + fn literals(&self) -> Option> { + self.scanner.literals() + } + + fn version_specifier(&self) -> Option<&model::VersionSpecifier> { + self.enabled.as_ref() + } +} + +impl ScannerDefinition for model::TokenItem { + fn name(&self) -> &Identifier { + &self.name + } + + fn to_scanner_code(&self) -> proc_macro2::TokenStream { + let defs: Vec<_> = self + .definitions + .iter() + .map(|def| VersionedScanner::new(&def.scanner, def.enabled.as_ref())) + .collect(); + + match defs.len() { + 0 => panic!("Token {} has no definitions", self.name), + 1 => defs.into_iter().next().unwrap().to_scanner_code(), + _ => choice_to_scanner_code(&defs), } } + + fn literals(&self) -> Option> { + self.definitions + .iter() + .try_fold(BTreeSet::new(), |mut acc, def| { + let literals = def.scanner.literals()?; + acc.extend(literals); + Some(acc) + }) + } } -pub(super) trait ScannerDefinitionNodeCodegen { +pub(crate) trait ScannerCodegen { + /// Quotes the matching Rust scanner code. fn to_scanner_code(&self) -> TokenStream; - fn literals(&self, accum: &mut BTreeSet) -> bool; + /// Whether the scanner is an atom, and if so, returns the atom. + fn as_atom(&self) -> Option<&str>; + /// Returns a set of literals that this scanner can match. 
+ fn literals(&self) -> Option>; } -impl ScannerDefinitionNodeCodegen for ScannerDefinitionNode { - // Returns true if this is nothing but a set of literals - fn literals(&self, accum: &mut BTreeSet) -> bool { - match self { - ScannerDefinitionNode::Versioned(body, _) => body.literals(accum), - ScannerDefinitionNode::Literal(string) => { - accum.insert(string.clone()); - true - } - ScannerDefinitionNode::Choice(nodes) => nodes - .iter() - .fold(true, |result, node| node.literals(accum) && result), - _ => false, - } +/// Enhances the [`model::Scanner`] with version information. +/// +/// Used to generate code for scanners that are versioned, i.e. wrapped in conditional blocks. +struct VersionedScanner<'a> { + scanner: &'a model::Scanner, + enabled: Option<&'a model::VersionSpecifier>, +} + +impl ScannerCodegen for VersionedScanner<'_> { + fn to_scanner_code(&self) -> TokenStream { + let scanner = self.scanner.to_scanner_code(); + self.enabled + .to_conditional_code(scanner, Some(quote! { false })) + } + + fn as_atom(&self) -> Option<&str> { + None + } + + fn literals(&self) -> Option> { + self.scanner.literals() } +} +impl<'a> VersionedScanner<'a> { + fn new(scanner: &'a model::Scanner, enabled: Option<&'a model::VersionSpecifier>) -> Self { + Self { scanner, enabled } + } +} + +impl ScannerCodegen for model::Scanner { fn to_scanner_code(&self) -> TokenStream { match self { - ScannerDefinitionNode::Versioned(body, version_quality_ranges) => { - let body = body.to_scanner_code(); - Some(version_quality_ranges).to_conditional_code(body, Some(quote! { false })) - } - - ScannerDefinitionNode::Optional(node) => { - let scanner = node.to_scanner_code(); + model::Scanner::Optional { scanner } => { + let scanner = scanner.to_scanner_code(); quote! { scan_optional!(input, #scanner) } } - - ScannerDefinitionNode::ZeroOrMore(node) => { - let scanner = node.to_scanner_code(); + model::Scanner::ZeroOrMore { scanner } => { + let scanner = scanner.to_scanner_code(); quote! 
{ scan_zero_or_more!(input, #scanner) } } - ScannerDefinitionNode::OneOrMore(node) => { - let scanner = node.to_scanner_code(); + model::Scanner::OneOrMore { scanner } => { + let scanner = scanner.to_scanner_code(); quote! { scan_one_or_more!(input, #scanner) } } - - ScannerDefinitionNode::NoneOf(string) => { - let chars = string.chars(); + model::Scanner::Not { chars } => { + let chars = chars.iter(); quote! { scan_none_of!(input, #(#chars),*) } } - - ScannerDefinitionNode::NotFollowedBy(node, lookahead) => { + model::Scanner::TrailingContext { + scanner: node, + not_followed_by: lookahead, + } => { let scanner = node.to_scanner_code(); let negative_lookahead_scanner = lookahead.to_scanner_code(); quote! { scan_not_followed_by!(input, #scanner, #negative_lookahead_scanner) } } - - ScannerDefinitionNode::Sequence(nodes) => { - let scanners = nodes + model::Scanner::Sequence { scanners } => { + let scanners = scanners .iter() .map(|e| e.to_scanner_code()) .collect::>(); quote! { scan_sequence!(#(#scanners),*) } } + model::Scanner::Choice { scanners: nodes } => choice_to_scanner_code(nodes), - ScannerDefinitionNode::Choice(nodes) => { - let mut scanners = vec![]; - let mut non_literal_scanners = vec![]; - for node in nodes { - if let ScannerDefinitionNode::Literal(string) = node { - scanners.push(string); - } else { - non_literal_scanners.push(node.to_scanner_code()); - } - } - scanners.sort(); - let mut scanners = scanners - .iter() - // We want the longest literals first, so we prefer the longest match - .rev() - .map(|string| { - let chars = string.chars(); - quote! { scan_chars!(input, #(#chars),*) } - }) - .collect::>(); - scanners.extend(non_literal_scanners); - quote! { scan_choice!(input, #(#scanners),*) } - } - - ScannerDefinitionNode::CharRange(from, to) => { - quote! { scan_char_range!(input, #from..=#to) } + model::Scanner::Range { + inclusive_start: start, + inclusive_end: end, + } => { + quote! 
{ scan_char_range!(input, #start..=#end) } } - - ScannerDefinitionNode::Literal(string) => { - let chars = string.chars(); + model::Scanner::Atom { atom } => { + let chars = atom.chars(); quote! { scan_chars!(input, #(#chars),*) } } - ScannerDefinitionNode::ScannerDefinition(scanner_definition) => { - let name = scanner_definition.name(); - let snake_case = name.to_snake_case(); + model::Scanner::Fragment { reference } => { + let snake_case = reference.to_snake_case(); let scanner_function_name = format_ident!("{snake_case}"); quote! { self.#scanner_function_name(input) } } } } + + fn as_atom(&self) -> Option<&str> { + match self { + model::Scanner::Atom { atom } => Some(atom), + _ => None, + } + } + + fn literals(&self) -> Option> { + fn accumulate(scanner: &model::Scanner, accum: &mut BTreeSet) -> bool { + match scanner { + model::Scanner::Atom { atom } => { + accum.insert(atom.clone()); + true + } + model::Scanner::Choice { scanners } => scanners + .iter() + .fold(true, |result, node| accumulate(node, accum) && result), + _ => false, + } + } + + let mut literals = BTreeSet::default(); + accumulate(self, &mut literals).then_some(literals) + } +} + +fn choice_to_scanner_code(nodes: &[T]) -> TokenStream { + let mut scanners = vec![]; + let mut non_literal_scanners = vec![]; + for node in nodes { + if let Some(atom) = node.as_atom() { + scanners.push(atom); + } else { + non_literal_scanners.push(node.to_scanner_code()); + } + } + scanners.sort_unstable(); + let mut scanners = scanners + .iter() + // We want the longest literals first, so we prefer the longest match + .rev() + .map(|string| { + let chars = string.chars(); + quote! { scan_chars!(input, #(#chars),*) } + }) + .collect::>(); + scanners.extend(non_literal_scanners); + quote! 
{ scan_choice!(input, #(#scanners),*) } } diff --git a/crates/codegen/runtime/generator/src/parser/codegen/trie.rs b/crates/codegen/runtime/generator/src/parser/codegen/trie.rs index a568299832..e717c0a654 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen/trie.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen/trie.rs @@ -5,8 +5,9 @@ use codegen_language_definition::model::KeywordDefinition; use proc_macro2::TokenStream; use quote::{format_ident, quote}; -use crate::parser::codegen::versioned::{Versioned as _, VersionedQuote as _}; -use crate::parser::grammar::{KeywordScannerAtomic, ScannerDefinitionRef}; +use crate::parser::codegen::versioned::VersionedQuote as _; +use crate::parser::codegen::KeywordItemAtom; +use crate::parser::grammar::ScannerDefinitionRef; #[derive(Clone, Debug, Default)] pub struct Trie { @@ -93,7 +94,7 @@ impl Trie { /// Used together with [`Trie`]. Represents the payload of a trie node and can be used to customize /// the emitted code. /// -/// Implemented for [`ScannerDefinitionRef`] and [`KeywordScannerAtomic`], allows to create +/// Implemented for [`ScannerDefinitionRef`] and [`KeywordItemAtom`], allows to create /// tries for both literal scanner definitions and keyword scanners. pub trait Payload { fn to_leaf_code(&self) -> TokenStream; @@ -104,7 +105,7 @@ impl Payload for ScannerDefinitionRef { fn to_leaf_code(&self) -> TokenStream { let kind = format_ident!("{}", self.name()); - self.node().version_specifier().to_conditional_code( + self.version_specifier().to_conditional_code( quote! { Some(TerminalKind::#kind) }, Some(Self::default_case()), ) @@ -115,9 +116,9 @@ impl Payload for ScannerDefinitionRef { } } -impl Payload for KeywordScannerAtomic { +impl Payload for KeywordItemAtom { fn to_leaf_code(&self) -> TokenStream { - let kind = format_ident!("{}", self.name()); + let kind = format_ident!("{}", self.name); let KeywordDefinition { enabled, reserved, .. 
diff --git a/crates/codegen/runtime/generator/src/parser/codegen/versioned.rs b/crates/codegen/runtime/generator/src/parser/codegen/versioned.rs index 4d7cc1a93f..165d996235 100644 --- a/crates/codegen/runtime/generator/src/parser/codegen/versioned.rs +++ b/crates/codegen/runtime/generator/src/parser/codegen/versioned.rs @@ -3,7 +3,7 @@ use proc_macro2::TokenStream; use quote::{format_ident, quote}; use semver::Version; -use crate::parser::grammar::{Labeled, ParserDefinitionNode, ScannerDefinitionNode}; +use crate::parser::grammar::{Labeled, ParserDefinitionNode}; pub(super) trait Versioned { fn version_specifier(&self) -> Option<&VersionSpecifier>; @@ -23,22 +23,6 @@ impl Versioned for ParserDefinitionNode { } } -impl Versioned for ScannerDefinitionNode { - fn version_specifier(&self) -> Option<&VersionSpecifier> { - match self { - ScannerDefinitionNode::Versioned(_, version_quality_ranges) => { - Some(version_quality_ranges) - } - - ScannerDefinitionNode::Optional(node) - | ScannerDefinitionNode::ZeroOrMore(node) - | ScannerDefinitionNode::OneOrMore(node) => node.version_specifier(), - - _ => None, - } - } -} - pub(super) trait VersionedQuote { /// Depending on the `as_bool_expr` result, wraps the given code in an `if` block and optionally includes an `else` block fn to_conditional_code( diff --git a/crates/codegen/runtime/generator/src/parser/grammar.rs b/crates/codegen/runtime/generator/src/parser/grammar.rs index 998210c516..c6f3d1bb03 100644 --- a/crates/codegen/runtime/generator/src/parser/grammar.rs +++ b/crates/codegen/runtime/generator/src/parser/grammar.rs @@ -5,8 +5,9 @@ // module with the one from the new DSLv2 in the `constructor` module. 
use std::collections::HashMap; +use std::rc::Rc; -use codegen_language_definition::model::Identifier; +use codegen_language_definition::model::{self, Identifier}; pub mod constructor; pub mod parser_definition; @@ -37,7 +38,7 @@ impl Grammar { #[derive(Clone)] pub enum GrammarElement { ScannerDefinition(ScannerDefinitionRef), - KeywordScannerDefinition(KeywordScannerDefinitionRef), + KeywordScannerDefinition(Rc), TriviaParserDefinition(TriviaParserDefinitionRef), ParserDefinition(ParserDefinitionRef), PrecedenceParserDefinition(PrecedenceParserDefinitionRef), @@ -71,7 +72,7 @@ impl Visitable for GrammarElement { fn accept_visitor(&self, visitor: &mut V) { match self { Self::ScannerDefinition(scanner) => scanner.accept_visitor(visitor), - Self::KeywordScannerDefinition(scanner) => scanner.accept_visitor(visitor), + Self::KeywordScannerDefinition(_) => {} Self::TriviaParserDefinition(trivia_parser) => trivia_parser.accept_visitor(visitor), Self::ParserDefinition(parser) => parser.accept_visitor(visitor), Self::PrecedenceParserDefinition(precedence_parser) => { diff --git a/crates/codegen/runtime/generator/src/parser/grammar/constructor.rs b/crates/codegen/runtime/generator/src/parser/grammar/constructor.rs index ead12ad308..573f802aed 100644 --- a/crates/codegen/runtime/generator/src/parser/grammar/constructor.rs +++ b/crates/codegen/runtime/generator/src/parser/grammar/constructor.rs @@ -12,9 +12,8 @@ use indexmap::IndexMap; use once_cell::sync::Lazy; use crate::parser::grammar::{ - DelimitedRecoveryTerminalThreshold, Grammar, GrammarElement, KeywordScannerDefinition, Labeled, - ParserDefinition, ParserDefinitionNode, PrecedenceParserDefinition, - PrecedenceParserDefinitionNode, ScannerDefinition, ScannerDefinitionNode, + DelimitedRecoveryTerminalThreshold, Grammar, GrammarElement, Labeled, ParserDefinition, + ParserDefinitionNode, PrecedenceParserDefinition, PrecedenceParserDefinitionNode, TriviaParserDefinition, }; @@ -112,42 +111,6 @@ impl Grammar { } } 
-#[derive(Debug)] -struct NamedScanner { - name: Identifier, - def: ScannerDefinitionNode, -} - -impl ScannerDefinition for NamedScanner { - fn name(&self) -> &Identifier { - &self.name - } - fn node(&self) -> &ScannerDefinitionNode { - &self.def - } -} - -#[derive(Debug)] -struct NamedKeywordScanner { - name: Identifier, - identifier_scanner_name: Identifier, - defs: Vec, -} - -impl KeywordScannerDefinition for NamedKeywordScanner { - fn name(&self) -> &Identifier { - &self.name - } - - fn definitions(&self) -> &[model::KeywordDefinition] { - &self.defs - } - - fn identifier_scanner(&self) -> &Identifier { - &self.identifier_scanner_name - } -} - #[derive(Debug)] struct NamedTriviaParser { name: Identifier, @@ -338,34 +301,20 @@ fn resolve_grammar_element(ident: &Identifier, ctx: &mut ResolveCtx<'_>) -> Gram // First time resolving a terminal named `ident` (None, None) => { let named_scanner = match elem { - Item::Trivia { item } => NamedScanner { - name: ident.clone(), - def: resolve_scanner(item.scanner.clone(), ctx), - }, - Item::Fragment { item } => NamedScanner { - name: ident.clone(), - def: resolve_fragment(item.deref().clone(), ctx), - }, - Item::Token { item } => NamedScanner { - name: ident.clone(), - def: resolve_token(item.deref().clone(), ctx), - }, + Item::Trivia { item } => Rc::clone(item) as Rc<_>, + Item::Fragment { item } => Rc::clone(item) as Rc<_>, + Item::Token { item } => Rc::clone(item) as Rc<_>, Item::Keyword { item } => { - let kw_scanner = NamedKeywordScanner { - name: ident.clone(), - identifier_scanner_name: item.identifier.clone(), - defs: item.definitions.clone(), - }; - // Keywords are special scanners and are handled separately - let resolved = GrammarElement::KeywordScannerDefinition(Rc::new(kw_scanner)); + let resolved = + GrammarElement::KeywordScannerDefinition(Rc::clone(item) as Rc<_>); ctx.resolved.insert(ident.clone(), resolved.clone()); return resolved; } _ => unreachable!("Only terminals can be resolved here"), }; - let 
resolved = GrammarElement::ScannerDefinition(Rc::new(named_scanner)); + let resolved = GrammarElement::ScannerDefinition(named_scanner); ctx.resolved.insert(ident.clone(), resolved.clone()); resolved @@ -373,72 +322,6 @@ fn resolve_grammar_element(ident: &Identifier, ctx: &mut ResolveCtx<'_>) -> Gram } } -fn resolve_scanner(scanner: model::Scanner, ctx: &mut ResolveCtx<'_>) -> ScannerDefinitionNode { - match scanner { - model::Scanner::Optional { scanner } => { - ScannerDefinitionNode::Optional(Box::new(resolve_scanner(*scanner, ctx))) - } - model::Scanner::ZeroOrMore { scanner } => { - ScannerDefinitionNode::ZeroOrMore(Box::new(resolve_scanner(*scanner, ctx))) - } - model::Scanner::OneOrMore { scanner } => { - ScannerDefinitionNode::OneOrMore(Box::new(resolve_scanner(*scanner, ctx))) - } - model::Scanner::Sequence { scanners } => ScannerDefinitionNode::Sequence( - scanners - .into_iter() - .map(|scanner| resolve_scanner(scanner, ctx)) - .collect(), - ), - model::Scanner::Choice { scanners } => ScannerDefinitionNode::Choice( - scanners - .into_iter() - .map(|scanner| resolve_scanner(scanner, ctx)) - .collect(), - ), - model::Scanner::Not { chars } => ScannerDefinitionNode::NoneOf(chars.into_iter().collect()), - model::Scanner::TrailingContext { - scanner, - not_followed_by, - } => ScannerDefinitionNode::NotFollowedBy( - Box::new(resolve_scanner(*scanner, ctx)), - Box::new(resolve_scanner(*not_followed_by, ctx)), - ), - model::Scanner::Range { - inclusive_start, - inclusive_end, - } => ScannerDefinitionNode::CharRange(inclusive_start, inclusive_end), - model::Scanner::Atom { atom } => ScannerDefinitionNode::Literal(atom), - model::Scanner::Fragment { reference } => match resolve_grammar_element(&reference, ctx) { - GrammarElement::ScannerDefinition(parser) => { - ScannerDefinitionNode::ScannerDefinition(parser) - } - _ => panic!("Expected {reference} to be a ScannerDefinition"), - }, - } -} - -fn resolve_fragment( - fragment: model::FragmentItem, - ctx: &mut 
ResolveCtx<'_>, -) -> ScannerDefinitionNode { - resolve_scanner(fragment.scanner, ctx).versioned(fragment.enabled) -} - -fn resolve_token(token: model::TokenItem, ctx: &mut ResolveCtx<'_>) -> ScannerDefinitionNode { - let resolved_defs: Vec<_> = token - .definitions - .into_iter() - .map(|def| resolve_scanner(def.scanner, ctx).versioned(def.enabled)) - .collect(); - - match resolved_defs.len() { - 0 => panic!("Token {} has no definitions", token.name), - 1 => resolved_defs.into_iter().next().unwrap(), - _ => ScannerDefinitionNode::Choice(resolved_defs), - } -} - fn resolve_trivia( parser: model::TriviaParser, kind: TriviaKind, @@ -775,16 +658,6 @@ impl VersionWrapped for ParserDefinitionNode { } } -impl VersionWrapped for ScannerDefinitionNode { - fn versioned(self, enabled: Option) -> Self { - if let Some(enabled) = enabled { - Self::Versioned(Box::new(self), enabled) - } else { - self - } - } -} - trait LabeledExt { fn anonymous(node: T) -> Self; fn with_ident_name(name: Identifier, node: T) -> Self; diff --git a/crates/codegen/runtime/generator/src/parser/grammar/parser_definition.rs b/crates/codegen/runtime/generator/src/parser/grammar/parser_definition.rs index 7c5466b49e..783bbc2d4e 100644 --- a/crates/codegen/runtime/generator/src/parser/grammar/parser_definition.rs +++ b/crates/codegen/runtime/generator/src/parser/grammar/parser_definition.rs @@ -4,9 +4,7 @@ use std::rc::Rc; use codegen_language_definition::model::{self, Identifier}; use crate::parser::grammar::visitor::{GrammarVisitor, Visitable}; -use crate::parser::grammar::{ - KeywordScannerDefinitionRef, PrecedenceParserDefinitionRef, ScannerDefinitionRef, -}; +use crate::parser::grammar::{PrecedenceParserDefinitionRef, ScannerDefinitionRef}; /// A named wrapper, used to give a name to a [`ParserDefinitionNode`]. 
#[derive(Clone, Debug)] @@ -82,7 +80,7 @@ pub enum ParserDefinitionNode { Sequence(Vec>), Choice(Labeled>), ScannerDefinition(ScannerDefinitionRef), - KeywordScannerDefinition(KeywordScannerDefinitionRef), + KeywordScannerDefinition(Rc), TriviaParserDefinition(TriviaParserDefinitionRef), ParserDefinition(ParserDefinitionRef), PrecedenceParserDefinition(PrecedenceParserDefinitionRef), diff --git a/crates/codegen/runtime/generator/src/parser/grammar/scanner_definition.rs b/crates/codegen/runtime/generator/src/parser/grammar/scanner_definition.rs index b964c67b01..a07e26b499 100644 --- a/crates/codegen/runtime/generator/src/parser/grammar/scanner_definition.rs +++ b/crates/codegen/runtime/generator/src/parser/grammar/scanner_definition.rs @@ -1,13 +1,25 @@ +use std::collections::BTreeSet; use std::fmt::Debug; use std::rc::Rc; use codegen_language_definition::model::{self, Identifier}; +use proc_macro2::TokenStream; use crate::parser::grammar::{GrammarVisitor, Visitable}; pub trait ScannerDefinition: Debug { + /// A unique identifier for this scanner. fn name(&self) -> &Identifier; - fn node(&self) -> &ScannerDefinitionNode; + /// Quotes the matching Rust scanner code. + fn to_scanner_code(&self) -> TokenStream; + /// A set of literals that this scanner can match. + /// + /// If the scanner matches more than just (a union of) literals, this method should return `None`. + fn literals(&self) -> Option>; + /// For which language version the scanner is defined. 
+ fn version_specifier(&self) -> Option<&model::VersionSpecifier> { + None + } } pub type ScannerDefinitionRef = Rc; @@ -15,134 +27,5 @@ pub type ScannerDefinitionRef = Rc; impl Visitable for ScannerDefinitionRef { fn accept_visitor(&self, visitor: &mut V) { visitor.scanner_definition_enter(self); - self.node().accept_visitor(visitor); - } -} - -#[derive(Clone, Debug)] -pub enum ScannerDefinitionNode { - Versioned(Box, model::VersionSpecifier), - Optional(Box), - ZeroOrMore(Box), - OneOrMore(Box), - Sequence(Vec), - Choice(Vec), - NoneOf(String), - NotFollowedBy(Box, Box), - CharRange(char, char), - Literal(String), - ScannerDefinition(ScannerDefinitionRef), -} - -impl From for ScannerDefinitionNode { - fn from(def_ref: ScannerDefinitionRef) -> Self { - ScannerDefinitionNode::ScannerDefinition(def_ref) - } -} - -impl Visitable for ScannerDefinitionNode { - fn accept_visitor(&self, visitor: &mut V) { - visitor.scanner_definition_node_enter(self); - match self { - Self::Versioned(node, _) - | Self::Optional(node) - | Self::ZeroOrMore(node) - | Self::OneOrMore(node) => node.accept_visitor(visitor), - - Self::Sequence(nodes) | Self::Choice(nodes) => { - for node in nodes { - node.accept_visitor(visitor); - } - } - - Self::NotFollowedBy(node, lookahead) => { - node.accept_visitor(visitor); - lookahead.accept_visitor(visitor); - } - - Self::NoneOf(_) - | Self::CharRange(_, _) - | Self::Literal(_) - | Self::ScannerDefinition(_) => {} - } - } -} - -pub trait KeywordScannerDefinition: Debug { - fn name(&self) -> &Identifier; - fn identifier_scanner(&self) -> &Identifier; - fn definitions(&self) -> &[model::KeywordDefinition]; -} - -pub type KeywordScannerDefinitionRef = Rc; - -impl Visitable for KeywordScannerDefinitionRef { - fn accept_visitor(&self, visitor: &mut V) { - visitor.keyword_scanner_definition_enter(self); - } -} - -impl From for ScannerDefinitionNode { - fn from(val: model::KeywordValue) -> Self { - match val { - model::KeywordValue::Optional { value } => { - 
ScannerDefinitionNode::Optional(Box::new((*value).into())) - } - model::KeywordValue::Sequence { values } => { - ScannerDefinitionNode::Sequence(values.into_iter().map(Into::into).collect()) - } - model::KeywordValue::Atom { atom } => ScannerDefinitionNode::Literal(atom), - model::KeywordValue::Choice { values } => { - ScannerDefinitionNode::Choice(values.into_iter().map(Into::into).collect()) - } - } - } -} - -/// A [`KeywordScannerDefinitionRef`] that only has a single atom value. -/// -/// The main usage for this type is to construct a keyword trie in parser generator, as trie will -/// only work with single atom values and keyword promotion needs to additionally account for -/// keyword reservation, rather than just literal presence. -#[derive(Clone)] -pub struct KeywordScannerAtomic(KeywordScannerDefinitionRef); - -impl KeywordScannerAtomic { - /// Wraps the keyword scanner definition if it is a single atom value. - pub fn try_from_def(def: &KeywordScannerDefinitionRef) -> Option { - match def.definitions() { - [model::KeywordDefinition { - value: model::KeywordValue::Atom { .. }, - .. - }] => Some(Self(Rc::clone(def))), - _ => None, - } - } -} - -impl std::ops::Deref for KeywordScannerAtomic { - type Target = KeywordScannerDefinitionRef; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl KeywordScannerAtomic { - pub fn definition(&self) -> &model::KeywordDefinition { - self.0 - .definitions() - .first() - .expect("KeywordScannerAtomic should have exactly one definition") - } - - pub fn value(&self) -> &str { - match self.definition() { - model::KeywordDefinition { - value: model::KeywordValue::Atom { atom }, - .. 
- } => atom, - _ => unreachable!("KeywordScannerAtomic should have a single atom value"), - } } } diff --git a/crates/codegen/runtime/generator/src/parser/grammar/visitor.rs b/crates/codegen/runtime/generator/src/parser/grammar/visitor.rs index 5256684180..40266fec47 100644 --- a/crates/codegen/runtime/generator/src/parser/grammar/visitor.rs +++ b/crates/codegen/runtime/generator/src/parser/grammar/visitor.rs @@ -1,7 +1,6 @@ use crate::parser::grammar::{ - Grammar, KeywordScannerDefinitionRef, ParserDefinitionNode, ParserDefinitionRef, - PrecedenceParserDefinitionNode, PrecedenceParserDefinitionRef, ScannerDefinitionNode, - ScannerDefinitionRef, TriviaParserDefinitionRef, + Grammar, ParserDefinitionNode, ParserDefinitionRef, PrecedenceParserDefinitionNode, + PrecedenceParserDefinitionRef, ScannerDefinitionRef, TriviaParserDefinitionRef, }; pub trait GrammarVisitor { @@ -9,12 +8,10 @@ pub trait GrammarVisitor { fn grammar_leave(&mut self, _grammar: &Grammar) {} fn scanner_definition_enter(&mut self, _scanner: &ScannerDefinitionRef) {} - fn keyword_scanner_definition_enter(&mut self, _scanner: &KeywordScannerDefinitionRef) {} fn trivia_parser_definition_enter(&mut self, _trivia_parser: &TriviaParserDefinitionRef) {} fn parser_definition_enter(&mut self, _parser: &ParserDefinitionRef) {} fn precedence_parser_definition_enter(&mut self, _parser: &PrecedenceParserDefinitionRef) {} - fn scanner_definition_node_enter(&mut self, _node: &ScannerDefinitionNode) {} fn parser_definition_node_enter(&mut self, _node: &ParserDefinitionNode) {} fn precedence_parser_definition_node_enter(&mut self, _node: &PrecedenceParserDefinitionNode) {} } diff --git a/crates/codegen/runtime/generator/src/parser/mod.rs b/crates/codegen/runtime/generator/src/parser/mod.rs index 637480f055..7a55c6a045 100644 --- a/crates/codegen/runtime/generator/src/parser/mod.rs +++ b/crates/codegen/runtime/generator/src/parser/mod.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::rc::Rc; 
-use codegen_language_definition::model::{Identifier, Language}; +use codegen_language_definition::model::{self, Identifier, Language}; use serde::Serialize; mod codegen; @@ -11,14 +11,15 @@ mod grammar; use codegen::{ KeywordScannerDefinitionCodegen as _, ParserDefinitionCodegen as _, - PrecedenceParserDefinitionCodegen as _, ScannerDefinitionCodegen as _, Trie, + PrecedenceParserDefinitionCodegen as _, Trie, }; use grammar::{ - Grammar, GrammarVisitor, KeywordScannerAtomic, KeywordScannerDefinitionRef, - ParserDefinitionNode, ParserDefinitionRef, PrecedenceParserDefinitionRef, ScannerDefinitionRef, - TriviaParserDefinitionRef, + Grammar, GrammarVisitor, ParserDefinitionNode, ParserDefinitionRef, + PrecedenceParserDefinitionRef, ScannerDefinitionRef, TriviaParserDefinitionRef, }; +use crate::parser::codegen::KeywordItemAtom; + /// Newtype for the already generated Rust code, not to be confused with regular strings. #[derive(Serialize, Default, Clone)] struct RustCode(String); @@ -78,7 +79,7 @@ struct ScannerContextAccumulatorState { /// Set of delimiter pairs for this context that are used in delimited error recovery. 
delimiters: BTreeMap, scanner_definitions: BTreeSet, - keyword_scanner_defs: BTreeMap, + keyword_scanner_defs: BTreeMap>, } impl ParserModel { @@ -121,7 +122,7 @@ impl ParserAccumulatorState { for scanner_name in &context.scanner_definitions { let scanner = &self.all_scanners[scanner_name]; - let literals = scanner.literals(); + let literals = scanner.literals().unwrap_or_default(); if literals.is_empty() { acc.compound_scanner_names.push(scanner_name.clone()); } else { @@ -135,12 +136,12 @@ impl ParserAccumulatorState { acc.promotable_identifier_scanners = context .keyword_scanner_defs .values() - .map(|def| def.identifier_scanner().clone()) + .map(|def| def.identifier.clone()) .collect(); let mut keyword_trie = Trie::new(); for (name, def) in &context.keyword_scanner_defs { - match KeywordScannerAtomic::try_from_def(def) { + match KeywordItemAtom::try_from_def(def) { Some(atomic) => keyword_trie.insert(atomic.value(), atomic.clone()), None => { acc.keyword_compound_scanners @@ -161,7 +162,7 @@ impl ParserAccumulatorState { .iter() .filter(|(name, scanner)| { // are compound (do not consist of only literals) - scanner.literals().is_empty() || + scanner.literals().is_none() || // but make sure to also include a scanner that is referenced by other scanners, even if not compound !self.top_level_scanner_names.contains(*name) }) @@ -249,7 +250,7 @@ impl GrammarVisitor for ParserAccumulatorState { ParserDefinitionNode::KeywordScannerDefinition(scanner) => { self.current_context() .keyword_scanner_defs - .insert(scanner.name().clone(), Rc::clone(scanner)); + .insert(scanner.name.clone(), Rc::clone(scanner)); } // Collect delimiters for each context