-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: create SymbolIterator for block parsing (#106)
* feat: create SymbolIterator
* feat: switch block parser to SymbolIterator
* feat: add itertools for SymbolIterator
* feat: switch to nesting symbol iterators
* fix: add prefix line test for symbol iterator
* feat: simplify iterator nesting parsers
* fix: correct heading end closure to detect heading
* fix: ignore newlines between elements
* feat: make end-fn optional for new symbol iterator
* fix: change end fns to get SymboliterMatcher
* fix: remove new_line from SymbolIterRoot
* fix: remove remaining symbols from tokenize output
* fix: correct prefix consumption for symbol iterator
* fix: fix endless loop in peeking_next()
* fix: correct iterator length calculation
* fix: prevent plain from merging with newline token
* fix: implement rendering for whitespace inlines
* fix: add comment why reset_peek() is needed
* fix: update verbatim to work with symbol iterator
* arch: split iterator into multiple files
* fix: add documentation for the symbol iterator
* feat: add nesting depth to symbol iterator
* fix: add EOI symbol to match end as empty line
* fix: remove EOI symbol for lexer tests
* fix: pin zerovec crate to specific version
* fix: resolve icu dependency problems
  check in lock file to prevent this in the future
* feat: update icu to not need any generated data
* fix: remove crate_authors!() due to clippy warning
  Behavior remains the same, because this was the default anyways.
* chore: remove lock file from vc after icu bump
* fix: add blankline for better readability
  Co-authored-by: Nadir Fejzić <[email protected]>
* fix: use `debug_assert!()` instead of `cfg(debug_assertions)`
  Co-authored-by: Nadir Fejzić <[email protected]>
* fix: make peeking_next() more compact
* fix: use owned Vec to create Paragraph from
* fix: use `iter::once()` to create end sequence
  Co-authored-by: Nadir Fejzić <[email protected]>
* fix: remove double dot at end of sentence
  Co-authored-by: Nadir Fejzić <[email protected]>
* fix: map length before unwrap of remaining_symbols
  Co-authored-by: Nadir Fejzić <[email protected]>
* fix: improve comments for SymbolIterator
* fix: remove Scanner struct
  Provide `scan_str()` as standalone function.
* fix: restrict visibility of iterator index fns
* fix: remove duplicate From<> impls for iterators
* fix: remove *curr* prefix for iterator functions
* fix: remove *curr* prefix from index in root iterator
* fix: add assert to ensure update done on act parent
  Assert only in debug mode.
---------
Co-authored-by: Nadir Fejzic <[email protected]>
- Loading branch information
Showing 19 changed files with 1,051 additions and 256 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,81 +1,74 @@ | ||
//! Scanner and helper types and traits for structurization of Unimarkup input. | ||
//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`. | ||
//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document. | ||
use icu_segmenter::GraphemeClusterSegmenter; | ||
|
||
pub mod position; | ||
pub mod span; | ||
mod symbol; | ||
|
||
use icu_segmenter::GraphemeClusterSegmenter; | ||
use position::{Offset, Position}; | ||
pub use symbol::{Symbol, SymbolKind}; | ||
|
||
#[derive(Debug)] | ||
pub struct Scanner { | ||
segmenter: GraphemeClusterSegmenter, | ||
} | ||
|
||
impl Clone for Scanner { | ||
fn clone(&self) -> Self { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
|
||
Self { segmenter } | ||
} | ||
} | ||
|
||
impl Default for Scanner { | ||
fn default() -> Self { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
use position::{Offset, Position as SymPos}; | ||
pub use symbol::{iterator::*, Symbol, SymbolKind}; | ||
|
||
Self { segmenter } | ||
} | ||
} | ||
|
||
impl Scanner { | ||
pub fn scan_str<'s>(&self, input: &'s str) -> Vec<Symbol<'s>> { | ||
let mut symbols: Vec<Symbol> = Vec::new(); | ||
let mut curr_pos: Position = Position::default(); | ||
let mut prev_offset = 0; | ||
/// Scans given input and returns vector of [`Symbol`]s needed to convert the input to Unimarkup content. | ||
pub fn scan_str(input: &str) -> Vec<Symbol<'_>> { | ||
let segmenter = GraphemeClusterSegmenter::new(); | ||
|
||
// skip(1) to ignore break at start of input | ||
for offset in self.segmenter.segment_str(input).skip(1) { | ||
if let Some(grapheme) = input.get(prev_offset..offset) { | ||
let mut kind = SymbolKind::from(grapheme); | ||
let mut symbols: Vec<Symbol> = Vec::new(); | ||
let mut curr_pos: SymPos = SymPos::default(); | ||
let mut prev_offset = 0; | ||
|
||
let end_pos = if kind == SymbolKind::Newline { | ||
Position { | ||
line: (curr_pos.line + 1), | ||
..Default::default() | ||
} | ||
} else { | ||
Position { | ||
line: curr_pos.line, | ||
col_utf8: (curr_pos.col_utf8 + grapheme.len()), | ||
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), | ||
col_grapheme: (curr_pos.col_grapheme + 1), | ||
} | ||
}; | ||
// skip(1) to ignore break at start of input | ||
for offset in segmenter.segment_str(input).skip(1) { | ||
if let Some(grapheme) = input.get(prev_offset..offset) { | ||
let mut kind = SymbolKind::from(grapheme); | ||
|
||
if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { | ||
// newline at the start of line -> Blankline | ||
kind = SymbolKind::Blankline; | ||
let end_pos = if kind == SymbolKind::Newline { | ||
SymPos { | ||
line: (curr_pos.line + 1), | ||
..Default::default() | ||
} | ||
} else { | ||
SymPos { | ||
line: curr_pos.line, | ||
col_utf8: (curr_pos.col_utf8 + grapheme.len()), | ||
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), | ||
col_grapheme: (curr_pos.col_grapheme + 1), | ||
} | ||
}; | ||
|
||
symbols.push(Symbol { | ||
input, | ||
kind, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: offset, | ||
}, | ||
start: curr_pos, | ||
end: end_pos, | ||
}); | ||
|
||
curr_pos = end_pos; | ||
if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { | ||
// newline at the start of line -> Blankline | ||
kind = SymbolKind::Blankline; | ||
} | ||
prev_offset = offset; | ||
} | ||
|
||
// last offset not needed, because break at EOI is always available | ||
symbols | ||
symbols.push(Symbol { | ||
input, | ||
kind, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: offset, | ||
}, | ||
start: curr_pos, | ||
end: end_pos, | ||
}); | ||
|
||
curr_pos = end_pos; | ||
} | ||
prev_offset = offset; | ||
} | ||
|
||
symbols.push(Symbol { | ||
input, | ||
kind: SymbolKind::EOI, | ||
offset: Offset { | ||
start: prev_offset, | ||
end: prev_offset, | ||
}, | ||
start: curr_pos, | ||
end: curr_pos, | ||
}); | ||
|
||
// last offset not needed, because break at EOI is always available | ||
symbols | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
//! Contains matcher traits and types used to detect iterator end and strip prefixes. | ||
//! The available matcher traits are implemented for [`SymbolIterator`]. | ||
use std::rc::Rc; | ||
|
||
use itertools::{Itertools, PeekingNext}; | ||
|
||
use crate::scanner::SymbolKind; | ||
|
||
use super::SymbolIterator; | ||
|
||
/// Function type to notify an iterator if an end was reached. | ||
pub type IteratorEndFn = Rc<dyn (Fn(&mut dyn EndMatcher) -> bool)>; | ||
|
||
/// Function type to consume prefix sequences of a new line. | ||
pub type IteratorPrefixFn = Rc<dyn (Fn(&mut dyn PrefixMatcher) -> bool)>; | ||
|
||
/// Trait containing functions that are available inside the end matcher function. | ||
pub trait EndMatcher { | ||
/// Returns `true` if the upcoming [`Symbol`] sequence is an empty line. | ||
/// Meaning that a line contains no [`Symbol`] or only [`SymbolKind::Whitespace`]. | ||
/// | ||
/// **Note:** This is also `true` if a parent iterator stripped non-whitespace symbols, and the nested iterator only has whitespace symbols. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn is_empty_line(&mut self) -> bool; | ||
|
||
/// Wrapper around [`Self::is_empty_line()`] that additionally consumes the matched empty line. | ||
/// Consuming means the related iterator advances over the matched empty line. | ||
/// | ||
/// **Note:** The iterator is only advanced if an empty line is matched. | ||
/// | ||
/// **Note:** The empty line is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
fn consumed_is_empty_line(&mut self) -> bool; | ||
|
||
/// Returns `true` if the given [`Symbol`] sequence matches the upcoming one. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn matches(&mut self, sequence: &[SymbolKind]) -> bool; | ||
|
||
/// Wrapper around [`Self::matches()`] that additionally consumes the matched sequence. | ||
/// Consuming means the related iterator advances over the matched sequence. | ||
/// | ||
/// **Note:** The iterator is only advanced if the sequence is matched. | ||
/// | ||
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool; | ||
|
||
/// Returns `true` if the iterator is at the given nesting depth. | ||
/// | ||
/// **Note** Use [`SymbolIterator::curr_depth()`] to get the current depth of an iterator. | ||
fn at_depth(&self, depth: usize) -> bool; | ||
} | ||
|
||
/// Trait containing functions that are available inside the prefix matcher function. | ||
pub trait PrefixMatcher { | ||
/// Consumes and returns `true` if the given [`Symbol`] sequence matches the upcoming one. | ||
/// Consuming means the related iterator advances over the matched sequence. | ||
/// | ||
/// **Note:** The iterator is only advanced if the sequence is matched. | ||
/// | ||
/// **Note:** The given sequence must **not** include any [`SymbolKind::Newline`], because matches are only considered per line. | ||
/// | ||
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. | ||
/// | ||
/// [`Symbol`]: super::Symbol | ||
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool; | ||
} | ||
|
||
impl<'input> EndMatcher for SymbolIterator<'input> { | ||
fn is_empty_line(&mut self) -> bool { | ||
// Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index | ||
self.reset_peek(); | ||
|
||
let next = self | ||
.peeking_next(|s| { | ||
matches!( | ||
s.kind, | ||
SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI | ||
) | ||
}) | ||
.map(|s| s.kind); | ||
|
||
let is_empty_line = if Some(SymbolKind::Newline) == next { | ||
let _whitespaces = self | ||
.peeking_take_while(|s| s.kind == SymbolKind::Whitespace) | ||
.count(); | ||
|
||
let new_line = self.peeking_next(|s| { | ||
matches!( | ||
s.kind, | ||
SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI | ||
) | ||
}); | ||
new_line.is_some() | ||
} else { | ||
next.is_some() | ||
}; | ||
|
||
is_empty_line | ||
} | ||
|
||
fn consumed_is_empty_line(&mut self) -> bool { | ||
let is_empty_line = self.is_empty_line(); | ||
|
||
if is_empty_line { | ||
self.set_index(self.peek_index()); // To consume peeked symbols | ||
} | ||
|
||
is_empty_line | ||
} | ||
|
||
fn matches(&mut self, sequence: &[SymbolKind]) -> bool { | ||
// Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index | ||
self.reset_peek(); | ||
|
||
for kind in sequence { | ||
if self.peeking_next(|s| s.kind == *kind).is_none() { | ||
return false; | ||
} | ||
} | ||
|
||
true | ||
} | ||
|
||
fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool { | ||
let matched = self.matches(sequence); | ||
|
||
if matched { | ||
self.set_index(self.peek_index()); // To consume peeked symbols | ||
} | ||
|
||
matched | ||
} | ||
|
||
fn at_depth(&self, depth: usize) -> bool { | ||
self.depth() == depth | ||
} | ||
} | ||
|
||
impl<'input> PrefixMatcher for SymbolIterator<'input> { | ||
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool { | ||
debug_assert!( | ||
!sequence.contains(&SymbolKind::Newline), | ||
"Newline symbol in prefix match is not allowed." | ||
); | ||
|
||
self.consumed_matches(sequence) | ||
} | ||
} |
Oops, something went wrong.