feat: create SymbolIterator for block parsing (#106)

* feat: create SymbolIterator * feat: switch block parser to SymbolIterator * feat: add itertools for SymbolIterator * feat: switch to nesting symbol iterators * fix: add prefix line test for symbol iterator * feat: simplify iterator nesting parsers * fix: correct heading end closure to detect heading * fix: ignore newlines between elements * feat: make end-fn optional for new symbol iterator * fix: change end fns to get SymboliterMatcher * fix: remove new_line from SymbolIterRoot * fix: remove remaining symbols from tokenize output * fix: correct prefix consumption for symbol iterator * fix: fix endless loop in peeking_next() * fix: correct iterator length calculation * fix: prevent plain from merging with newline token * fix: implement rendering for whitespace inlines * fix: add comment why reset_peek() is needed * fix: update verbatim to work with symbol iterator * arch: split iterator into multiple files * fix: add documentation for the symbol iterator * feat: add nesting depth to symbol iterator * fix: add EOI symbol to match end as empty line * fix: remove EOI symbol for lexer tests * fix: pin zerovec crate to specific version * fix: resolve icu dependency problems check in lock file to prevent this in the future * feat: update icu to not need any generated data * fix: remove crate_authors!() due to clippy warning Behavior remains the same, because this was the default anyways. 
* chore: remove lock file from vc after icu bump * fix: add blankline for better readability Co-authored-by: Nadir Fejzić <[email protected]> * fix: use `debug_assert!()` instead of `cfg(debug_assertions)` Co-authored-by: Nadir Fejzić <[email protected]> * fix: make peeking_next() more compact * fix: use owned Vec to create Paragraph from * fix: use `iter::once()` to create end sequence Co-authored-by: Nadir Fejzić <[email protected]> * fix: remove double dot at end of sentence Co-authored-by: Nadir Fejzić <[email protected]> * fix: map length before unwrap of remaining_symbols Co-authored-by: Nadir Fejzić <[email protected]> * fix: improve comments for SymbolIterator * fix: remove Scanner struct Provide `scan_str()` as standalone function. * fix: restrict visibility of iterator index fns * fix: remove duplicate From<> impls for iterators * fix: remove *curr* prefix for iterator functions * fix: remove *curr* prefix from index in root iterator * fix: add assert to ensure update done on act parent Assert only in debug mode. --------- Co-authored-by: Nadir Fejzic <[email protected]>
unimarkup · Oct 2, 2023 · dd98ae2 · dd98ae2
1 parent 94148b3
commit dd98ae2
Show file tree

Hide file tree

Showing 19 changed files with 1,051 additions and 256 deletions.
diff --git a/commons/Cargo.toml b/commons/Cargo.toml
@@ -23,6 +23,7 @@ icu_segmenter = "1.3.0"
 icu_locid = "1.3.0"
 regex = { version = "1.8.1", optional = true }
 insta = { version = "1.29.0", features = ["serde"], optional = true }
+itertools = "0.11.0"
 
 [features]
 test_runner = ["dep:regex", "dep:once_cell", "dep:insta"]
diff --git a/commons/src/scanner/mod.rs b/commons/src/scanner/mod.rs
@@ -1,81 +1,74 @@
-//! Scanner and helper types and traits for structurization of Unimarkup input.
+//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`.
+//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document.
+
+use icu_segmenter::GraphemeClusterSegmenter;
 
 pub mod position;
 pub mod span;
 mod symbol;
 
-use icu_segmenter::GraphemeClusterSegmenter;
-use position::{Offset, Position};
-pub use symbol::{Symbol, SymbolKind};
-
-#[derive(Debug)]
-pub struct Scanner {
-    segmenter: GraphemeClusterSegmenter,
-}
-
-impl Clone for Scanner {
-    fn clone(&self) -> Self {
-        let segmenter = GraphemeClusterSegmenter::new();
-
-        Self { segmenter }
-    }
-}
-
-impl Default for Scanner {
-    fn default() -> Self {
-        let segmenter = GraphemeClusterSegmenter::new();
+use position::{Offset, Position as SymPos};
+pub use symbol::{iterator::*, Symbol, SymbolKind};
 
-        Self { segmenter }
-    }
-}
-
-impl Scanner {
-    pub fn scan_str<'s>(&self, input: &'s str) -> Vec<Symbol<'s>> {
-        let mut symbols: Vec<Symbol> = Vec::new();
-        let mut curr_pos: Position = Position::default();
-        let mut prev_offset = 0;
+/// Scans the given input and returns the vector of [`Symbol`]s needed to convert it to Unimarkup content; the last symbol is always [`SymbolKind::EOI`].
+pub fn scan_str(input: &str) -> Vec<Symbol<'_>> {
+    let segmenter = GraphemeClusterSegmenter::new();
 
-        // skip(1) to ignore break at start of input
-        for offset in self.segmenter.segment_str(input).skip(1) {
-            if let Some(grapheme) = input.get(prev_offset..offset) {
-                let mut kind = SymbolKind::from(grapheme);
+    let mut symbols: Vec<Symbol> = Vec::new();
+    let mut curr_pos: SymPos = SymPos::default();
+    let mut prev_offset = 0;
 
-                let end_pos = if kind == SymbolKind::Newline {
-                    Position {
-                        line: (curr_pos.line + 1),
-                        ..Default::default()
-                    }
-                } else {
-                    Position {
-                        line: curr_pos.line,
-                        col_utf8: (curr_pos.col_utf8 + grapheme.len()),
-                        col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()),
-                        col_grapheme: (curr_pos.col_grapheme + 1),
-                    }
-                };
+    // skip(1) to ignore break at start of input
+    for offset in segmenter.segment_str(input).skip(1) {
+        if let Some(grapheme) = input.get(prev_offset..offset) {
+            let mut kind = SymbolKind::from(grapheme);
 
-                if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline {
-                    // newline at the start of line -> Blankline
-                    kind = SymbolKind::Blankline;
+            let end_pos = if kind == SymbolKind::Newline {
+                SymPos {
+                    line: (curr_pos.line + 1),
+                    ..Default::default()
                 }
+            } else {
+                SymPos {
+                    line: curr_pos.line,
+                    col_utf8: (curr_pos.col_utf8 + grapheme.len()),
+                    col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()),
+                    col_grapheme: (curr_pos.col_grapheme + 1),
+                }
+            };
 
-                symbols.push(Symbol {
-                    input,
-                    kind,
-                    offset: Offset {
-                        start: prev_offset,
-                        end: offset,
-                    },
-                    start: curr_pos,
-                    end: end_pos,
-                });
-
-                curr_pos = end_pos;
+            if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline {
+                // newline at the start of line -> Blankline
+                kind = SymbolKind::Blankline;
             }
-            prev_offset = offset;
-        }
 
-        // last offset not needed, because break at EOI is always available
-        symbols
+            symbols.push(Symbol {
+                input,
+                kind,
+                offset: Offset {
+                    start: prev_offset,
+                    end: offset,
+                },
+                start: curr_pos,
+                end: end_pos,
+            });
+
+            curr_pos = end_pos;
+        }
+        prev_offset = offset;
     }
+
+    symbols.push(Symbol {
+        input,
+        kind: SymbolKind::EOI,
+        offset: Offset {
+            start: prev_offset,
+            end: prev_offset,
+        },
+        start: curr_pos,
+        end: curr_pos,
+    });
+
+    // No trailing segment is left to handle after the loop, because the segmenter always reports a break at the end of input.
+    symbols
 }
diff --git a/commons/src/scanner/symbol/iterator/matcher.rs b/commons/src/scanner/symbol/iterator/matcher.rs
@@ -0,0 +1,150 @@
+//! Contains matcher traits and types used to detect iterator end and strip prefixes.
+//! The available matcher traits are implemented for [`SymbolIterator`].
+
+use std::rc::Rc;
+
+use itertools::{Itertools, PeekingNext};
+
+use crate::scanner::SymbolKind;
+
+use super::SymbolIterator;
+
+/// Function type to notify an iterator if an end was reached.
+pub type IteratorEndFn = Rc<dyn (Fn(&mut dyn EndMatcher) -> bool)>;
+
+/// Function type to consume prefix sequences of a new line.
+pub type IteratorPrefixFn = Rc<dyn (Fn(&mut dyn PrefixMatcher) -> bool)>;
+
+/// Trait containing functions that are available inside the end matcher function.
+pub trait EndMatcher {
+    /// Returns `true` if the upcoming [`Symbol`] sequence is an empty line.
+    /// A line is empty if it contains no [`Symbol`] or only [`SymbolKind::Whitespace`] symbols.
+    ///
+    /// **Note:** This is also `true` if a parent iterator stripped non-whitespace symbols, and the nested iterator only has whitespace symbols.
+    ///
+    /// [`Symbol`]: super::Symbol
+    fn is_empty_line(&mut self) -> bool;
+
+    /// Wrapper around [`Self::is_empty_line()`] that additionally consumes the matched empty line.
+    /// Consuming means the related iterator advances over the matched empty line.
+    ///
+    /// **Note:** The iterator is only advanced if an empty line is matched.
+    ///
+    /// **Note:** The empty line is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
+    fn consumed_is_empty_line(&mut self) -> bool;
+
+    /// Returns `true` if the given [`SymbolKind`] sequence matches the kinds of the upcoming [`Symbol`]s.
+    ///
+    /// [`Symbol`]: super::Symbol
+    fn matches(&mut self, sequence: &[SymbolKind]) -> bool;
+
+    /// Wrapper around [`Self::matches()`] that additionally consumes the matched sequence.
+    /// Consuming means the related iterator advances over the matched sequence.
+    ///
+    /// **Note:** The iterator is only advanced if the sequence is matched.
+    ///
+    /// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
+    fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool;
+
+    /// Returns `true` if the iterator is at the given nesting depth.
+    ///
+    /// **Note:** Use [`SymbolIterator::depth()`] to get the current depth of an iterator.
+    fn at_depth(&self, depth: usize) -> bool;
+}
+
+/// Trait containing functions that are available inside the prefix matcher function.
+pub trait PrefixMatcher {
+    /// Consumes the upcoming [`Symbol`]s and returns `true` if their kinds match the given [`SymbolKind`] sequence.
+    /// Consuming means the related iterator advances over the matched sequence.
+    ///
+    /// **Note:** The iterator is only advanced if the sequence is matched.
+    ///
+    /// **Note:** The given sequence must **not** include any [`SymbolKind::Newline`], because matches are only considered per line.
+    ///
+    /// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
+    ///
+    /// [`Symbol`]: super::Symbol
+    fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool;
+}
+
+impl<'input> EndMatcher for SymbolIterator<'input> {
+    fn is_empty_line(&mut self) -> bool {
+        // Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index
+        self.reset_peek();
+
+        let next = self
+            .peeking_next(|s| {
+                matches!(
+                    s.kind,
+                    SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI
+                )
+            })
+            .map(|s| s.kind);
+
+        let is_empty_line = if Some(SymbolKind::Newline) == next {
+            let _whitespaces = self
+                .peeking_take_while(|s| s.kind == SymbolKind::Whitespace)
+                .count();
+
+            let new_line = self.peeking_next(|s| {
+                matches!(
+                    s.kind,
+                    SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI
+                )
+            });
+            new_line.is_some()
+        } else {
+            next.is_some()
+        };
+
+        is_empty_line
+    }
+
+    fn consumed_is_empty_line(&mut self) -> bool {
+        let is_empty_line = self.is_empty_line();
+
+        if is_empty_line {
+            self.set_index(self.peek_index()); // To consume peeked symbols
+        }
+
+        is_empty_line
+    }
+
+    fn matches(&mut self, sequence: &[SymbolKind]) -> bool {
+        // Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index
+        self.reset_peek();
+
+        for kind in sequence {
+            if self.peeking_next(|s| s.kind == *kind).is_none() {
+                return false;
+            }
+        }
+
+        true
+    }
+
+    fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool {
+        let matched = self.matches(sequence);
+
+        if matched {
+            self.set_index(self.peek_index()); // To consume peeked symbols
+        }
+
+        matched
+    }
+
+    fn at_depth(&self, depth: usize) -> bool {
+        self.depth() == depth
+    }
+}
+
+impl<'input> PrefixMatcher for SymbolIterator<'input> {
+    fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool {
+        debug_assert!(
+            !sequence.contains(&SymbolKind::Newline),
+            "Newline symbol in prefix match is not allowed."
+        );
+
+        self.consumed_matches(sequence)
+    }
+}