diff --git a/parser/src/earley/lexer.rs b/parser/src/earley/lexer.rs index 00f8e1ba..3210b2d4 100644 --- a/parser/src/earley/lexer.rs +++ b/parser/src/earley/lexer.rs @@ -27,11 +27,14 @@ pub struct Lexer { pub type StateID = derivre::StateID; +/// PreLexeme contains the index of the lexeme but not the bytes. #[derive(Debug, Clone, Copy)] pub struct PreLexeme { pub idx: LexemeIdx, pub byte: Option, + /// Does the 'byte' above belong to the next lexeme? pub byte_next_row: bool, + /// Length in bytes of the hidden part of the lexeme. pub hidden_len: usize, } diff --git a/parser/src/earley/lexerspec.rs b/parser/src/earley/lexerspec.rs index 5d0d5860..b6ab6c61 100644 --- a/parser/src/earley/lexerspec.rs +++ b/parser/src/earley/lexerspec.rs @@ -28,6 +28,9 @@ pub struct LexemeSpec { json_options: Option, } +/// LexemeIdx is an index into the lexeme table. +/// It corresponds to a category like IDENTIFIER or STRING, +/// or to a very specific lexeme like WHILE or MULTIPLY. #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub struct LexemeIdx(usize); diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs index 20611b51..59db589e 100644 --- a/parser/src/earley/parser.rs +++ b/parser/src/earley/parser.rs @@ -1355,8 +1355,8 @@ impl ParserState { self.grammar.lexer_spec() } - // mk_lexeme() converts the pre-lexemes for the current row into - // a lexeme, and returns it. + // mk_lexeme() converts a pre-lexeme for the current row into + // a lexeme (i.e., it determines the bytes that go into the lexeme), and returns it. 
#[inline(always)] fn mk_lexeme(&self, byte: Option, pre_lexeme: PreLexeme) -> Lexeme { let mut bytes = self.curr_row_bytes(); @@ -1501,11 +1501,13 @@ impl ParserState { // This is never inlined anyways, so better make it formal #[inline(never)] fn advance_parser(&mut self, shared: &mut SharedState, pre_lexeme: PreLexeme) -> bool { + // this byte will be applied to the next lexeme let transition_byte = if pre_lexeme.byte_next_row { pre_lexeme.byte } else { None }; + // this is the last byte of the lexeme let lexeme_byte = if pre_lexeme.byte_next_row { None } else {