Skip to content

Commit

Permalink
feat: create SymbolIterator for block parsing (#106)
Browse files Browse the repository at this point in the history
* feat: create SymbolIterator

* feat: switch block parser to SymbolIterator

* feat: add itertools for SymbolIterator

* feat: switch to nesting symbol iterators

* fix: add prefix line test for symbol iterator

* feat: simplify iterator nesting parsers

* fix: correct heading end closure to detect heading

* fix: ignore newlines between elements

* feat: make end-fn optional for new symbol iterator

* fix: change end fns to get SymboliterMatcher

* fix: remove new_line from SymbolIterRoot

* fix: remove remaining symbols from tokenize output

* fix: correct prefix consumption for symbol iterator

* fix: fix endless loop in peeking_next()

* fix: correct iterator length calculation

* fix: prevent plain from merging with newline token

* fix: implement rendering for whitespace inlines

* fix: add comment why reset_peek() is needed

* fix: update verbatim to work with symbol iterator

* arch: split iterator into multiple files

* fix: add documentation for the symbol iterator

* feat: add nesting depth to symbol iterator

* fix: add EOI symbol to match end as empty line

* fix: remove EOI symbol for lexer tests

* fix: pin zerovec crate to specific version

* fix: resolve icu dependency problems

check in lock file to prevent this in the future

* feat: update icu to not need any generated data

* fix: remove crate_authors!() due to clippy warning

Behavior remains the same because this was already the default.

* chore: remove lock file from vc after icu bump

* fix: add blankline for better readability

Co-authored-by: Nadir Fejzić <[email protected]>

* fix: use `debug_assert!()` instead of `cfg(debug_assertions)`

Co-authored-by: Nadir Fejzić <[email protected]>

* fix: make peeking_next() more compact

* fix: use owned Vec to create Paragraph from

* fix: use `iter::once()` to create end sequence

Co-authored-by: Nadir Fejzić <[email protected]>

* fix: remove double dot at end of sentence

Co-authored-by: Nadir Fejzić <[email protected]>

* fix: map length before unwrap of remaining_symbols

Co-authored-by: Nadir Fejzić <[email protected]>

* fix: improve comments for SymbolIterator

* fix: remove Scanner struct

Provide `scan_str()` as standalone function.

* fix: restrict visibility of iterator index fns

* fix: remove duplicate From<> impls for iterators

* fix: remove *curr* prefix for iterator functions

* fix: remove *curr* prefix from index in root iterator

* fix: add assert to ensure update done on act parent

Assert only in debug mode.

---------

Co-authored-by: Nadir Fejzic <[email protected]>
  • Loading branch information
mhatzl and nfejzic authored Oct 2, 2023
1 parent 94148b3 commit dd98ae2
Show file tree
Hide file tree
Showing 19 changed files with 1,051 additions and 256 deletions.
1 change: 1 addition & 0 deletions commons/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ icu_segmenter = "1.3.0"
icu_locid = "1.3.0"
regex = { version = "1.8.1", optional = true }
insta = { version = "1.29.0", features = ["serde"], optional = true }
itertools = "0.11.0"

[features]
test_runner = ["dep:regex", "dep:once_cell", "dep:insta"]
125 changes: 59 additions & 66 deletions commons/src/scanner/mod.rs
Original file line number Diff line number Diff line change
@@ -1,81 +1,74 @@
//! Scanner and helper types and traits for structurization of Unimarkup input.
//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`.
//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document.
use icu_segmenter::GraphemeClusterSegmenter;

pub mod position;
pub mod span;
mod symbol;

use icu_segmenter::GraphemeClusterSegmenter;
use position::{Offset, Position};
pub use symbol::{Symbol, SymbolKind};

#[derive(Debug)]
pub struct Scanner {
segmenter: GraphemeClusterSegmenter,
}

impl Clone for Scanner {
fn clone(&self) -> Self {
let segmenter = GraphemeClusterSegmenter::new();

Self { segmenter }
}
}

impl Default for Scanner {
fn default() -> Self {
let segmenter = GraphemeClusterSegmenter::new();
use position::{Offset, Position as SymPos};
pub use symbol::{iterator::*, Symbol, SymbolKind};

Self { segmenter }
}
}

impl Scanner {
pub fn scan_str<'s>(&self, input: &'s str) -> Vec<Symbol<'s>> {
let mut symbols: Vec<Symbol> = Vec::new();
let mut curr_pos: Position = Position::default();
let mut prev_offset = 0;
/// Scans given input and returns vector of [`Symbol`]s needed to convert the input to Unimarkup content.
pub fn scan_str(input: &str) -> Vec<Symbol<'_>> {
let segmenter = GraphemeClusterSegmenter::new();

// skip(1) to ignore break at start of input
for offset in self.segmenter.segment_str(input).skip(1) {
if let Some(grapheme) = input.get(prev_offset..offset) {
let mut kind = SymbolKind::from(grapheme);
let mut symbols: Vec<Symbol> = Vec::new();
let mut curr_pos: SymPos = SymPos::default();
let mut prev_offset = 0;

let end_pos = if kind == SymbolKind::Newline {
Position {
line: (curr_pos.line + 1),
..Default::default()
}
} else {
Position {
line: curr_pos.line,
col_utf8: (curr_pos.col_utf8 + grapheme.len()),
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()),
col_grapheme: (curr_pos.col_grapheme + 1),
}
};
// skip(1) to ignore break at start of input
for offset in segmenter.segment_str(input).skip(1) {
if let Some(grapheme) = input.get(prev_offset..offset) {
let mut kind = SymbolKind::from(grapheme);

if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline {
// newline at the start of line -> Blankline
kind = SymbolKind::Blankline;
let end_pos = if kind == SymbolKind::Newline {
SymPos {
line: (curr_pos.line + 1),
..Default::default()
}
} else {
SymPos {
line: curr_pos.line,
col_utf8: (curr_pos.col_utf8 + grapheme.len()),
col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()),
col_grapheme: (curr_pos.col_grapheme + 1),
}
};

symbols.push(Symbol {
input,
kind,
offset: Offset {
start: prev_offset,
end: offset,
},
start: curr_pos,
end: end_pos,
});

curr_pos = end_pos;
if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline {
// newline at the start of line -> Blankline
kind = SymbolKind::Blankline;
}
prev_offset = offset;
}

// last offset not needed, because break at EOI is always available
symbols
symbols.push(Symbol {
input,
kind,
offset: Offset {
start: prev_offset,
end: offset,
},
start: curr_pos,
end: end_pos,
});

curr_pos = end_pos;
}
prev_offset = offset;
}

symbols.push(Symbol {
input,
kind: SymbolKind::EOI,
offset: Offset {
start: prev_offset,
end: prev_offset,
},
start: curr_pos,
end: curr_pos,
});

// last offset not needed, because break at EOI is always available
symbols
}
150 changes: 150 additions & 0 deletions commons/src/scanner/symbol/iterator/matcher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
//! Contains matcher traits and types used to detect iterator end and strip prefixes.
//! The available matcher traits are implemented for [`SymbolIterator`].
use std::rc::Rc;

use itertools::{Itertools, PeekingNext};

use crate::scanner::SymbolKind;

use super::SymbolIterator;

/// Shared, reference-counted callback used to notify an iterator if an end was reached.
///
/// The callback receives an [`EndMatcher`] to inspect (and optionally consume) upcoming symbols.
pub type IteratorEndFn = Rc<dyn Fn(&mut dyn EndMatcher) -> bool>;

/// Shared, reference-counted callback used to consume prefix sequences of a new line.
///
/// The callback receives a [`PrefixMatcher`] to strip the prefix.
/// NOTE(review): the returned `bool` presumably signals whether the prefix matched — confirm against the iterator implementation.
pub type IteratorPrefixFn = Rc<dyn Fn(&mut dyn PrefixMatcher) -> bool>;

/// Trait containing functions that are available inside the end matcher function.
///
/// The non-consuming functions only look ahead, while the `consumed_*` variants
/// additionally advance the iterator when (and only when) the match succeeds.
pub trait EndMatcher {
/// Returns `true` if the upcoming [`Symbol`] sequence is an empty line.
/// Meaning that a line contains no [`Symbol`] or only [`SymbolKind::Whitespace`].
///
/// **Note:** This is also `true` if a parent iterator stripped non-whitespace symbols, and the nested iterator only has whitespace symbols.
///
/// [`Symbol`]: super::Symbol
fn is_empty_line(&mut self) -> bool;

/// Wrapper around [`Self::is_empty_line()`] that additionally consumes the matched empty line.
/// Consuming means the related iterator advances over the matched empty line.
///
/// **Note:** The iterator is only advanced if an empty line is matched.
///
/// **Note:** The empty line is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
fn consumed_is_empty_line(&mut self) -> bool;

/// Returns `true` if the given [`SymbolKind`] sequence matches the upcoming [`Symbol`]s.
/// The sequence is compared element by element against the kinds of the upcoming symbols.
///
/// [`Symbol`]: super::Symbol
fn matches(&mut self, sequence: &[SymbolKind]) -> bool;

/// Wrapper around [`Self::matches()`] that additionally consumes the matched sequence.
/// Consuming means the related iterator advances over the matched sequence.
///
/// **Note:** The iterator is only advanced if the sequence is matched.
///
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool;

/// Returns `true` if the iterator is at the given nesting depth.
///
/// **Note:** Use [`SymbolIterator::depth()`] to get the current depth of an iterator.
fn at_depth(&self, depth: usize) -> bool;
}

/// Trait containing functions that are available inside the prefix matcher function.
///
/// A prefix matcher is used to strip a prefix sequence at the start of a line.
pub trait PrefixMatcher {
/// Consumes and returns `true` if the given [`SymbolKind`] sequence matches the upcoming [`Symbol`]s.
/// Consuming means the related iterator advances over the matched sequence.
///
/// **Note:** The iterator is only advanced if the sequence is matched.
///
/// **Note:** The given sequence must **not** include any [`SymbolKind::Newline`], because matches are only considered per line.
/// The implementation for [`SymbolIterator`] checks this with a debug assertion.
///
/// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
///
/// [`Symbol`]: super::Symbol
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool;
}

impl<'input> EndMatcher for SymbolIterator<'input> {
    /// Peeks ahead to decide whether the upcoming symbols form an empty line.
    ///
    /// Only the peek cursor is moved; the iterator index itself stays untouched.
    fn is_empty_line(&mut self) -> bool {
        // Kinds that may start or terminate an empty line.
        let ends_line = |kind: &SymbolKind| {
            matches!(
                *kind,
                SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI
            )
        };

        // An end closure may call several matchers in a row.
        // Rewinding the peek cursor guarantees that each of them starts at the same index.
        self.reset_peek();

        let first_kind = self
            .peeking_next(|symbol| ends_line(&symbol.kind))
            .map(|symbol| symbol.kind);

        // `Blankline` and `EOI` already decide the outcome, and so does a missing match.
        if first_kind != Some(SymbolKind::Newline) {
            return first_kind.is_some();
        }

        // After a plain newline, the line may only hold whitespace up to the next line end.
        let _skipped_whitespaces = self
            .peeking_take_while(|symbol| symbol.kind == SymbolKind::Whitespace)
            .count();

        self.peeking_next(|symbol| ends_line(&symbol.kind)).is_some()
    }

    /// Same check as [`Self::is_empty_line()`], but advances the iterator over the empty line on success.
    fn consumed_is_empty_line(&mut self) -> bool {
        if !self.is_empty_line() {
            return false;
        }

        // Move the iterator index up to the peek cursor to consume the peeked symbols.
        self.set_index(self.peek_index());
        true
    }

    /// Peeks ahead to check that the upcoming symbol kinds equal `sequence`, element by element.
    fn matches(&mut self, sequence: &[SymbolKind]) -> bool {
        // An end closure may call several matchers in a row.
        // Rewinding the peek cursor guarantees that each of them starts at the same index.
        self.reset_peek();

        // `all()` stops at the first symbol that does not match, so nothing further is peeked.
        sequence
            .iter()
            .all(|expected| self.peeking_next(|symbol| symbol.kind == *expected).is_some())
    }

    /// Same check as [`Self::matches()`], but advances the iterator over the matched sequence on success.
    fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool {
        if !self.matches(sequence) {
            return false;
        }

        // Move the iterator index up to the peek cursor to consume the peeked symbols.
        self.set_index(self.peek_index());
        true
    }

    /// Compares the depth reported by the iterator with the given `depth`.
    fn at_depth(&self, depth: usize) -> bool {
        depth == self.depth()
    }
}

impl<'input> PrefixMatcher for SymbolIterator<'input> {
fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool {
debug_assert!(
!sequence.contains(&SymbolKind::Newline),
"Newline symbol in prefix match is not allowed."
);

self.consumed_matches(sequence)
}
}
Loading

0 comments on commit dd98ae2

Please sign in to comment.