Skip to content

Commit

Permalink
Lexer code deduplication and refactoring (#414)
Browse files Browse the repository at this point in the history
  • Loading branch information
casey authored Apr 18, 2019
1 parent 0ad5574 commit d065d1c
Showing 1 changed file with 59 additions and 42 deletions.
101 changes: 59 additions & 42 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ impl<'a> Lexer<'a> {
&self.text[self.token_start.offset..self.token_end.offset]
}

/// Length of the token currently being lexed, computed as the distance
/// between the token's end offset and its start offset.
fn current_token_length(&self) -> usize {
    let start = self.token_start.offset;
    let end = self.token_end.offset;
    end - start
}

/// Returns `true` when the next character is exactly `c`.
///
/// Returns `false` when there is no next character at all.
fn next_is(&self, c: char) -> bool {
    match self.next {
        Some(upcoming) => upcoming == c,
        None => false,
    }
}

/// Returns `true` when the next character is a space (' ') or a tab ('\t').
fn next_is_whitespace(&self) -> bool {
    // Checked in the same order as before: space first, then tab.
    [' ', '\t'].iter().any(|&blank| self.next_is(blank))
}

/// Un-lexed text
fn rest(&self) -> &'a str {
&self.text[self.token_end.offset..]
Expand All @@ -95,9 +110,14 @@ impl<'a> Lexer<'a> {
self.rest().starts_with(prefix)
}

/// Length of current token
fn current_token_length(&self) -> usize {
self.token_end.offset - self.token_start.offset
/// Are we at a line ending, i.e. does the un-lexed text begin with "\n"
/// or "\r\n"?
fn at_eol(&self) -> bool {
    // A bare line feed is detected from the next character alone.
    if self.next_is('\n') {
        return true;
    }

    // Otherwise only a full carriage-return + line-feed pair counts.
    self.rest_starts_with("\r\n")
}

/// Returns `true` at a line ending (see `at_eol`) or when no un-lexed text
/// remains.
fn at_eol_or_eof(&self) -> bool {
    if self.at_eol() {
        true
    } else {
        // Nothing left to lex means end-of-file.
        self.rest().is_empty()
    }
}

/// Get current state
Expand Down Expand Up @@ -237,7 +257,7 @@ impl<'a> Lexer<'a> {

// Handle blank line
if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
while let Some(' ') | Some('\t') = self.next {
while self.next_is_whitespace() {
self.advance()?;
}

Expand All @@ -250,7 +270,7 @@ impl<'a> Lexer<'a> {
}

// Handle nonblank lines with no leading whitespace
if self.next != Some(' ') && self.next != Some('\t') {
if !self.next_is_whitespace() {
if let State::Indented { .. } = self.state()? {
self.token(Dedent);
self.pop_state()?;
Expand All @@ -261,30 +281,33 @@ impl<'a> Lexer<'a> {

// Handle continued indentation
if let State::Indented { indentation } = self.state()? {
let mut remaining = indentation.len();

// Advance over whitespace up to length of current indentation
while let Some(' ') | Some('\t') = self.next {
self.advance()?;
remaining -= 1;
if remaining == 0 {
break;
if self.rest_starts_with(indentation) {
for _ in indentation.chars() {
self.advance()?;
}
}

let lexeme = self.lexeme();
// Indentation matches, lex as whitespace
self.token(Whitespace);

if lexeme != indentation {
return Err(self.error(InconsistentLeadingWhitespace {
expected: indentation,
found: lexeme,
}));
return Ok(());
}

// Indentation matches, lex as whitespace
self.token(Whitespace);
// Consume whitespace characters, matching or not, up to the length
// of expected indentation
for _ in indentation.chars().zip(self.rest().chars()) {
if self.next_is_whitespace() {
self.advance()?;
} else {
break;
}
}

return Ok(());
// We've either advanced over not enough whitespace or mismatching
// whitespace, so return an error
return Err(self.error(InconsistentLeadingWhitespace {
expected: indentation,
found: self.lexeme(),
}));
}

if self.state()? != State::Normal {
Expand All @@ -295,7 +318,7 @@ impl<'a> Lexer<'a> {
}

// Handle new indentation
while let Some(' ') | Some('\t') = self.next {
while self.next_is_whitespace() {
self.advance()?;
}

Expand Down Expand Up @@ -356,7 +379,7 @@ impl<'a> Lexer<'a> {
self.pop_state()?;
// Emit interpolation end token
self.lex_double(InterpolationEnd)
} else if self.rest_starts_with("\n") || self.rest_starts_with("\r\n") {
} else if self.at_eol_or_eof() {
// Return unterminated interpolation error that highlights the opening {{
Err(self.unterminated_interpolation_error(interpolation_start))
} else {
Expand Down Expand Up @@ -446,7 +469,7 @@ impl<'a> Lexer<'a> {
fn lex_colon(&mut self) -> CompilationResult<'a, ()> {
self.advance()?;

if let Some('=') = self.next {
if self.next_is('=') {
self.advance()?;
self.token(ColonEquals);
} else {
Expand Down Expand Up @@ -492,8 +515,10 @@ impl<'a> Lexer<'a> {

/// Lex name: [a-zA-Z_][a-zA-Z0-9_-]*
fn lex_name(&mut self) -> CompilationResult<'a, ()> {
while let Some('a'...'z') | Some('A'...'Z') | Some('0'...'9') | Some('_') | Some('-') =
self.next
while self
.next
.map(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
.unwrap_or(false)
{
self.advance()?;
}
Expand All @@ -508,11 +533,7 @@ impl<'a> Lexer<'a> {
// advance over #
self.advance()?;

loop {
if let Some('\r') | Some('\n') | None = self.next {
break;
}

while !self.at_eol_or_eof() {
self.advance()?;
}

Expand All @@ -523,30 +544,26 @@ impl<'a> Lexer<'a> {

/// Lex backtick: `[^`\r\n]*`
fn lex_backtick(&mut self) -> CompilationResult<'a, ()> {
// advance over `
// advance over initial `
self.advance()?;

loop {
if let Some('\r') | Some('\n') | None = self.next {
while !self.next_is('`') {
if self.at_eol_or_eof() {
return Err(self.error(UnterminatedBacktick));
}

if let Some('`') = self.next {
self.advance()?;
break;
}

self.advance()?;
}

self.advance()?;
self.token(Backtick);

Ok(())
}

/// Lex whitespace: [ \t]+
fn lex_whitespace(&mut self) -> CompilationResult<'a, ()> {
while let Some(' ') | Some('\t') = self.next {
while self.next_is_whitespace() {
self.advance()?
}

Expand Down

0 comments on commit d065d1c

Please sign in to comment.