Skip to content

Commit

Permalink
Merge pull request #1 from savi-lang/add/initial-code
Browse files Browse the repository at this point in the history
Initial commit of working PEG parser library.
  • Loading branch information
jemc authored Sep 16, 2022
2 parents e67e1df + bbfd361 commit 5dd1a5f
Show file tree
Hide file tree
Showing 22 changed files with 905 additions and 4 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
A base repository for Savi language libraries, with common CI actions configured.
# PEG

See the [Guide](https://github.com/savi-lang/base-standard-library/wiki/Guide) for details on how it works and how to use it for your own libraries.
Parsing Expression Grammars for the Savi standard library.
59 changes: 57 additions & 2 deletions spec/PEG.Spec.savi
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,60 @@
:is Spec
:const describes: "PEG"

:it "has a placeholder method for demonstrating testing"
assert: PEG.placeholder == True
:it "can parse JSON content using a JSON grammar"
parser = PEG.Parser(_JSON.Token, String).new(
_JSON.Grammar.new
_JSON.TokenPrinter.new
)

// TODO: Use one big string once the Savi formatter gets fixed to not complain.
assert: parser.parse!(String.join([
<<<{>>>
<<< "Hello": "World!",>>>
<<< "from": {>>>
<<< "name": "savi-lang/PEG",>>>
<<< "easy-as": [1, 2, 3],>>>
<<< "nifty": true,>>>
<<< "overcomplicated": false,>>>
<<< "worse-than": null,>>>
<<< "problems": [],>>>
<<< "utf8": ["Д", "Ⴃ", "𐀀"]>>>
<<< }>>>
<<<}>>>
], "\n")) == String.join([
"0-222: _JSON.Token.Object" // {
"4-21: _JSON.Token.Pair"
"5-10: _JSON.Token.String" // "Hello":
"14-20: _JSON.Token.String" // "World!",
"25-220: _JSON.Token.Pair"
"26-30: _JSON.Token.String" // "from":
"33-220: _JSON.Token.Object" // {
"39-62: _JSON.Token.Pair"
"40-44: _JSON.Token.String" // "name":
"48-61: _JSON.Token.String" // "savi-lang/PEG",
"68-88: _JSON.Token.Pair"
"69-76: _JSON.Token.String" // "easy-as":
"79-88: _JSON.Token.Array" // [
"80-81: _JSON.Token.Number" // 1,
"83-84: _JSON.Token.Number" // 2,
"86-87: _JSON.Token.Number" // 3],
"94-107: _JSON.Token.Pair"
"95-100: _JSON.Token.String" // "nifty":
"103-107: _JSON.Token.True" // true,
"113-137: _JSON.Token.Pair"
"114-129: _JSON.Token.String" // "overcomplicated":
"132-137: _JSON.Token.False" // false,
"143-161: _JSON.Token.Pair"
"144-154: _JSON.Token.String" // "worse-than":
"157-161: _JSON.Token.Null" // null,
"167-181: _JSON.Token.Pair"
"168-176: _JSON.Token.String" // "problems":
"179-181: _JSON.Token.Array" // [],
"187-216: _JSON.Token.Pair"
"188-192: _JSON.Token.String" // "utf8":
"195-216: _JSON.Token.Array" // [
"197-199: _JSON.Token.String" // (string with a 2-byte character),
"203-206: _JSON.Token.String" // (string with a 3-byte character),
"210-214: _JSON.Token.String" // (string with a 4-byte character)]
"" // }}
], "\n")
116 changes: 116 additions & 0 deletions spec/_JSON.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// In this file we implement a simple JSON parser as a PEG grammar for testing,
// not because we think it's a great idea to use PEG to parse JSON, but because
// JSON is a very simple grammar and many people are very familiar with it.
// Also, it has many common constructs (numbers, strings, arrays, etc.) that
// people might want to use in the actual language they're intending to parse,
// so it serves as a good real-world example of how to use this library,
// and how Parsing Expression Grammars work in general.

// This type declares the different token kinds that we will parse.
// You don't necessarily need to use an enum for this, but it's a solid choice
// to do so, because they are represented very efficiently at runtime.
// In theory you can use any `val` that implements `IntoString` for printing.
:enum _JSON.Token
  :member Null 0   // the literal `null`
  :member True 1   // the literal `true`
  :member False 2  // the literal `false`
  :member Number 3 // a JSON number (int, optional fraction/exponent)
  :member String 4 // the content of a JSON string (between the quotes)
  :member Pair 5   // a key/value pair inside an object
  :member Object 6 // a `{ ... }` object
  :member Array 7  // a `[ ... ]` array

// Here we declare the grammar itself, which describes what patterns to check
// for in the language and how to tag them with tokens when we find them.
// We use `PEG.DSL` here to instantiate patterns succinctly, using `@`-calls
// to create new patterns from scratch, method calls to create outer patterns
// from an inner pattern, and operator sugar to compose patterns together.
:class val _JSON.Grammar
  :is PEG.Grammar(_JSON.Token)
  :new val
    // Define what whitespace looks like.
    whitespace = @char(' ') / @char('\t') / @char('\r') / @char('\n')
    s = whitespace.repeat

    // The root of a JSON document is an array or an object,
    // optionally surrounded by some whitespace.
    // `array` and `object` are forward-declared here (via `@declare`)
    // because they are mutually recursive with `value` below; their real
    // definitions are supplied later with `.define(...)`.
    array = @declare
    object = @declare
    @root = (s >> (array / object) >> s).then_eof

    // Define what a number looks like.
    // Note that PEG `/` is *ordered* choice: the longer alternatives of
    // `int` are listed first so they win over the shorter prefixes.
    digit19 = @range('1', '9')
    digit = @range('0', '9')
    digits = digit.repeat(1)
    int = (
      (@char('-') >> digit19 >> digit.repeat)
      / (@char('-') >> digit)
      / (digit19 >> digit.repeat)
      / digit
    )
    frac = @char('.') >> digits
    exp = (@char('e') / @char('E')) >> (@char('+') / @char('-')).maybe >> digits
    number = (
      int >> frac.maybe >> exp.maybe
    ).token(_JSON.Token.Number)

    // Define what a string looks like.
    // The last `string_char` alternative uses negative lookahead (`.not`)
    // to accept any codepoint that is neither a closing quote nor the
    // start of an escape sequence.
    hex = digit / @range('a', 'f') / @range('A', 'F')
    string_char = (
      @str("\\\"") / @str("\\\\") / @str("\\/")
      / @str("\\b") / @str("\\f") / @str("\\n") / @str("\\r") / @str("\\t")
      / (@str("\\u") >> hex >> hex >> hex >> hex)
      / (@char('"').not >> @char('\\').not >> @any)
    )
    // Only the content between the quotes is tokenized, not the quotes.
    string = (
      @char('"')
      >> string_char.repeat.token(_JSON.Token.String)
      >> @char('"')
    )

    // Define what constitutes a value.
    value = (
      @str("null").token(_JSON.Token.Null)
      / @str("true").token(_JSON.Token.True)
      / @str("false").token(_JSON.Token.False)
      / number / string
      / array / object
    )

    // Define what an array is, in terms of zero or more values.
    values = value >> s >> (@char(',') >> s >> value >> s).repeat
    array.define(
      (
        @char('[')
        >> s >> values.maybe
        >> s >> @char(']')
      ).token(_JSON.Token.Array)
    )

    // Define what an object is, in terms of zero or more key/value pairs.
    pair = (string >> s >> @char(':') >> s >> value).token(_JSON.Token.Pair)
    pairs = pair >> s >> (@char(',') >> s >> pair >> s).repeat
    object.define(
      (
        @char('{')
        >> s >> pairs.maybe
        >> s >> @char('}')
      ).token(_JSON.Token.Object)
    )

// Finally, we also need to declare a builder that can assemble the stream
// of tokens into some output type. In this case our output type will be
// a simple string, printing the offset range and kind of each token.
//
// In a real-world use case, instead of outputting a `String`, you'd likely
// output an AST data structure of some kind instead, or if you're building
// a very simple interpreter, perhaps you could directly build the result value.
:class _JSON.TokenPrinter
  :is PEG.Parser.Builder(_JSON.Token, String)
  // NOTE(review): `reset` is not overridden here — presumably the trait's
  // default no-op is sufficient because this builder holds no state.

  // Emit one line per token in the form "start-end: kind", where start/end
  // are byte offsets into the parsed source.
  :fun ref build(tokens Array(PEG.Token(_JSON.Token))'val) String
    out = String.new_iso
    tokens.each -> (token |
      out << "\(token.start)-\(token.end): \(token.kind)\n"
    )
    --out // consume the `iso` reference so it can be returned
71 changes: 71 additions & 0 deletions src/PEG.DSL.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
:: A trait of convenience methods for creating patterns.
::
:: This DSL is meant for use in composing a `PEG.Grammar`.
:trait PEG.DSL(T PEG.Token.Kind)
  :: Introduce a forward-declaration of a pattern to be defined later.
  :: This allows patterns to be a recursive/cyclical graph rather than acyclic.
  :fun non declare: PEG.Pattern.Forward(T).new

  :: Check for any Unicode codepoint, failing at end of input or invalid data.
  // NOTE(review): unlike the other factory methods this returns the type
  // without calling `.new` — presumably `UnicodeAny` is stateless; confirm.
  :fun non any: PEG.Pattern.UnicodeAny(T)

  :: Check for exact matches with the given literal string.
  :: As an optimization, returns `PEG.Pattern.Byte` if the string is one byte.
  :fun non str(text String)
    try (
      if (text.size == 1) (
        pattern = PEG.Pattern.Byte(T).new(text.byte_at!(0))
        return pattern // TODO: make these two lines a single line
      )
    )
    PEG.Pattern.Literal(T).new(text)

  :: Check for exact matches with the given Unicode codepoint.
  :: As an optimization, returns `PEG.Pattern.Byte` if the codepoint is in
  :: the ASCII range (i.e. it would encode to a single UTF-8 byte).
  :fun non char(codepoint U32)
    if (codepoint < 0x80) (
      return PEG.Pattern.Byte(T).new(codepoint.u8)
    )
    PEG.Pattern.Literal(T).new("\(codepoint.format.unicode)")

  :: Check for a given range of Unicode codepoints, inclusive of both ends.
  :fun non range(min, max): PEG.Pattern.UnicodeRange(T).new(min, max)

:: A trait of convenience methods/operators for composing patterns based
:: on a given base pattern (the receiver of the method call).
::
:: This DSL is meant for use in composing a `PEG.Grammar`.
:trait PEG.DSL.Methods(T PEG.Token.Kind)
  // TODO: Remove the need for this duplication with `PEG.Pattern`, without
  // causing the compiler to enter an infinite recursion of trait copying.
  :fun description String
  :is IntoString
  :fun val check_match(source String, offset USize, state _State(T)) _Result(T)
  :fun val match(source String, offset USize, state _State(T)) _Result(T)
  // TODO: Remove the above lines ^

  :: Chain the given pattern after this one, in a sequence.
  :fun ref ">>"(other): PEG.Pattern.Sequence(T).new([@, other])

  :: Allow an ordered choice between this pattern and the given other one,
  :: instructing the parser to try to match the other one if this one fails.
  :fun ref "/"(other): PEG.Pattern.Choice(T).new([@, other])

  :: Use this pattern for "negative lookahead", checking what's ahead
  :: to ensure it doesn't match the pattern, without actually consuming it.
  :: Chain this method twice to get "positive lookahead", checking the opposite.
  :fun ref not: PEG.Pattern.Not(T).new(@)
  // TODO: :fun ref "~": PEG.Pattern.Not(T).new(@)

  :: Check for this pattern to repeat at least the given number of times,
  :: consuming it as many times as needed until it no longer matches.
  :: With the default (`min = 0`) this is the PEG "zero or more" operator.
  :fun ref repeat(min USize = 0): PEG.Pattern.Repeat(T).new(@, min)

  :: Allow this pattern to be optional, avoiding failure if it doesn't match.
  :fun ref maybe: PEG.Pattern.Optional(T).new(@)

  :: Follow this pattern by checking to ensure that the end of input is reached.
  :fun ref then_eof: PEG.Pattern.EOF(T).new(@)

  :: When this pattern is matched, mark a token for the content it matched.
  :fun ref token(kind T): PEG.Pattern.Tokenize(T).new(@, kind)
4 changes: 4 additions & 0 deletions src/PEG.Grammar.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
:: A grammar bundles a root pattern with the `PEG.DSL` convenience methods
:: used to construct it. Implementors assign `@root` in their constructor.
:trait val PEG.Grammar(T PEG.Token.Kind)
  :copies PEG.DSL(T)

  :: The entry-point pattern; `PEG.Parser` starts matching here.
  :let root PEG.Pattern(T)
24 changes: 24 additions & 0 deletions src/PEG.Parser.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
:: A builder receives the flat token stream from a successful parse and
:: assembles it into the final output type `O` (e.g. an AST or a string).
:trait PEG.Parser.Builder(T PEG.Token.Kind, O Any'any)
  :fun ref build(tokens Array(PEG.Token(T))'val) O
  :fun ref reset: None // default no-op; override to clear state between parses

:: A parser pairs a grammar (what to match) with a builder (what to produce),
:: exposing `parse!` as the main entry point.
:class PEG.Parser(T PEG.Token.Kind, O Any'any)
  :let grammar PEG.Grammar(T)
  :let builder PEG.Parser.Builder(T, O)
  :new (@grammar, @builder)

  // Diagnostics from the most recent `parse!` call:
  // the number of bytes consumed before success/failure...
  :var last_parse_byte_size USize: 0
  // ...and the pattern that caused the failure, if the parse failed.
  :var last_parse_fail_pattern (PEG.Pattern(T)'val | None): None

  :: Parse `source` starting at `offset` and build the output value.
  :: Raises an error if the parse fails, leaving the failing pattern
  :: available in `last_parse_fail_pattern` for diagnostics.
  :fun ref parse!(source, offset = 0) O
    @builder.reset

    result = @grammar.root.match(source, offset, _State(T).new)
    @last_parse_byte_size = result.length

    try (
      @builder.build(result.success_tokens!)
    |
      // `success_tokens!` raised: the parse failed - record why and re-raise.
      @last_parse_fail_pattern = result.fail_pattern!
      error!
    )
32 changes: 32 additions & 0 deletions src/PEG.Pattern.Byte.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
:: `PEG.Pattern.Byte` is used to consume a single specific byte.
::
:: Parsing will fail if the next byte in the string doesn't match that byte.
:: Otherwise, the pattern succeeds, consuming the matched bytes.
:class PEG.Pattern.Byte(T PEG.Token.Kind)
  :is PEG.Pattern(T)

  :let expected U8
  :new (@expected)

  :fun description: "\(@expected.format.unicode)"

  :is IntoString
  :fun into_string_space USize
    // Space for the surrounding "char('...')" plus the escaped byte itself.
    "char('')".size + @expected.format.printable_ascii.into_string_space
  :fun into_string(out String'iso)
    out << "char('"
    out = @expected.format.printable_ascii.into_string(--out)
    out << "')"
    --out

  :fun val check_match(
    source String
    offset USize
    state _State(T)
  ) _Result(T)
    // Succeed (consuming exactly 1 byte) when the byte at `offset` is the
    // expected one. On mismatch we `error!` into the else-branch; past the
    // end of input, `byte_at!` raises and lands there too. Either way the
    // failure consumes 0 bytes and names this pattern as the cause.
    try (
      error! if (source.byte_at!(offset) != @expected)
      _Result(T).success(1)
    |
      _Result(T).fail(0, @)
    )
74 changes: 74 additions & 0 deletions src/PEG.Pattern.Choice.savi
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
:: `PEG.Pattern.Choice` is used to specify an ordered choice of patterns.
::
:: Returns the result of the first child pattern that matched.
:: Returns the longest-length failure if all child patterns fail.
:class PEG.Pattern.Choice(T PEG.Token.Kind)
  :is PEG.Pattern(T)

  :let children Array(PEG.Pattern(T))
  :new (@children)

  :: Override this DSL operator to accrue into the existing choice.
  :: (This keeps `a / b / c` as one flat Choice instead of nesting Choices.)
  :fun "/"(other PEG.Pattern(T))
    children ref = @children.clone
    children << other
    @new(--children)

  :: A human-readable description for error messages, in the form
  :: "either A, or B, or C" (or the sole child's description).
  :fun description
    if (@children.size < 2) (
      try (@children[0]!.description | "empty choice!")
    |
      out = String.new_iso
      @children.each_with_index -> (child, index |
        if index.is_zero (
          out << "either "
          out << child.description
        |
          out << ", or "
          out << child.description
        )
      )
      --out
    )

  :is IntoString
  :fun into_string_space USize
    // Space for "(" and ")" plus one " / " separator between each pair.
    space = (@children.size - 1) * 3 + 2
    @children.each -> (child | space += child.into_string_space)
    space
  :fun into_string(out String'iso) String'iso
    out << "("
    @children.each_with_index -> (child, index |
      if index.is_nonzero (out << " / ")
      out = child.into_string(--out)
    )
    out << ")"
    --out

  :fun val check_match(
    source String
    offset USize
    state _State(T)
  ) _Result(T)
    fail_result = _Result(T).fail(0, @)

    // Try each child pattern in order, looking for the first successful match.
    @children.each_with_index -> (child, child_index |
      result = child.match(source, offset, state)

      // On first success, return the result.
      if result.is_success (
        return result
      )

      // On failure, record the info if this is the longest failure yet seen.
      if (result.length > fail_result.length) (
        fail_result = result
      )
    )

    // All children failed - report the deepest failure position and its
    // pattern (falling back to this Choice itself) for error diagnostics.
    state.observe_fail(
      offset + fail_result.length
      try (fail_result.fail_pattern! | @)
    )
    fail_result
Loading

0 comments on commit 5dd1a5f

Please sign in to comment.