-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from savi-lang/add/initial-code
Initial commit of working PEG parser library.
- Loading branch information
Showing
22 changed files
with
905 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
A base repository for Savi language libraries, with common CI actions configured. | ||
# PEG | ||
|
||
See the [Guide](https://github.com/savi-lang/base-standard-library/wiki/Guide) for details on how it works and how to use it for your own libraries. | ||
Parsing Expression Grammars for the Savi standard library. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// In this file we implement a simple JSON parser as a PEG grammar for testing, | ||
// not because we think it's a great idea to use PEG to parse JSON, but because | ||
// JSON is a very simple grammar and many people are very familiar with it. | ||
// Also, it has many common constructs (numbers, strings, arrays, etc.) that | ||
// people might want to use in the actual language they're intending to parse, | ||
// so it serves as a good real-world example of how to use this library, | ||
// and how Parsing Expression Grammars work in general. | ||
|
||
// This type declares the different token kinds that we will parse. | ||
// You don't necessarily need to use an enum for this, but it's a solid choice | ||
// to do so, because they are represented very efficiently at runtime. | ||
// In theory you can use any `val` that implements `IntoString` for printing. | ||
// Each member is given an explicit numeric value, from 0 through 7. | ||
// In the grammar below, `Null` through `String` tag individual values, | ||
// while `Pair`, `Object` and `Array` tag the structures that contain them. | ||
:enum _JSON.Token | ||
:member Null 0 | ||
:member True 1 | ||
:member False 2 | ||
:member Number 3 | ||
:member String 4 | ||
:member Pair 5 | ||
:member Object 6 | ||
:member Array 7 | ||
|
||
// Here we declare the grammar itself, which describes what patterns to check | ||
// for in the language and how to tag them with tokens when we find them. | ||
// We use `PEG.DSL` here to instantiate patterns succinctly, using `@`-calls | ||
// to create new patterns from scratch, method calls to create outer patterns | ||
// from an inner pattern, and operator sugar to compose patterns together. | ||
:class val _JSON.Grammar | ||
:is PEG.Grammar(_JSON.Token) | ||
:new val | ||
// Define what whitespace looks like. | ||
whitespace = @char(' ') / @char('\t') / @char('\r') / @char('\n') | ||
// `s` is zero or more whitespace characters (`repeat` defaults to a minimum | ||
// of zero), so it can be placed anywhere that whitespace is optional. | ||
s = whitespace.repeat | ||
|
||
// The root of a JSON document is an array or an object, | ||
// optionally surrounded by some whitespace. | ||
// Both are forward-declared here because `value` (further below) refers to | ||
// them, while their own definitions refer back to `value`. | ||
array = @declare | ||
object = @declare | ||
@root = (s >> (array / object) >> s).then_eof | ||
|
||
// Define what a number looks like. | ||
digit19 = @range('1', '9') | ||
digit = @range('0', '9') | ||
digits = digit.repeat(1) | ||
// The alternatives are tried in order. A multi-digit integer must start with | ||
// a digit from 1 to 9; a zero is only accepted as a single lone digit. | ||
int = ( | ||
(@char('-') >> digit19 >> digit.repeat) | ||
/ (@char('-') >> digit) | ||
/ (digit19 >> digit.repeat) | ||
/ digit | ||
) | ||
frac = @char('.') >> digits | ||
exp = (@char('e') / @char('E')) >> (@char('+') / @char('-')).maybe >> digits | ||
number = ( | ||
int >> frac.maybe >> exp.maybe | ||
).token(_JSON.Token.Number) | ||
|
||
// Define what a string looks like. | ||
hex = digit / @range('a', 'f') / @range('A', 'F') | ||
// A string character is one of the listed backslash escapes, a `\u` escape | ||
// followed by four hex digits, or any other codepoint that is neither a | ||
// double quote nor a backslash (checked with negative lookahead). | ||
string_char = ( | ||
@str("\\\"") / @str("\\\\") / @str("\\/") | ||
/ @str("\\b") / @str("\\f") / @str("\\n") / @str("\\r") / @str("\\t") | ||
/ (@str("\\u") >> hex >> hex >> hex >> hex) | ||
/ (@char('"').not >> @char('\\').not >> @any) | ||
) | ||
// Only the content between the quotes is tagged as the `String` token; | ||
// the surrounding quote characters are matched outside of the token. | ||
string = ( | ||
@char('"') | ||
>> string_char.repeat.token(_JSON.Token.String) | ||
>> @char('"') | ||
) | ||
|
||
// Define what constitutes a value. | ||
value = ( | ||
@str("null").token(_JSON.Token.Null) | ||
/ @str("true").token(_JSON.Token.True) | ||
/ @str("false").token(_JSON.Token.False) | ||
/ number / string | ||
/ array / object | ||
) | ||
|
||
// Define what an array is, in terms of zero or more values. | ||
// `values` is one value followed by any number of comma-prefixed values; | ||
// the "zero values" case comes from `values.maybe` below. | ||
values = value >> s >> (@char(',') >> s >> value >> s).repeat | ||
array.define( | ||
( | ||
@char('[') | ||
>> s >> values.maybe | ||
>> s >> @char(']') | ||
).token(_JSON.Token.Array) | ||
) | ||
|
||
// Define what an object is, in terms of zero or more key/value pairs. | ||
// This mirrors the array definition, with `pair` in place of `value`. | ||
pair = (string >> s >> @char(':') >> s >> value).token(_JSON.Token.Pair) | ||
pairs = pair >> s >> (@char(',') >> s >> pair >> s).repeat | ||
object.define( | ||
( | ||
@char('{') | ||
>> s >> pairs.maybe | ||
>> s >> @char('}') | ||
).token(_JSON.Token.Object) | ||
) | ||
|
||
// Finally, we also need to declare a builder that can assemble the stream | ||
// of tokens into some output type. In this case our output type will be | ||
// a simple string, printing the offset range and kind of each token. | ||
// | ||
// In a real-world use case, instead of outputting a `String`, you'd likely | ||
// output an AST data structure of some kind instead, or if you're building | ||
// a very simple interpreter, perhaps you could directly build the result value. | ||
:class _JSON.TokenPrinter | ||
:is PEG.Parser.Builder(_JSON.Token, String) | ||
|
||
// Build the output string from the given tokens, appending one line per | ||
// token in the form "start-end: kind". | ||
:fun ref build(tokens Array(PEG.Token(_JSON.Token))'val) String | ||
out = String.new_iso | ||
tokens.each -> (token | | ||
out << "\(token.start)-\(token.end): \(token.kind)\n" | ||
) | ||
// Hand the accumulated string back as the result. | ||
--out |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
:: A trait of convenience methods for creating patterns. | ||
:: | ||
:: This DSL is meant for use in composing a `PEG.Grammar`. | ||
:trait PEG.DSL(T PEG.Token.Kind) | ||
:: Introduce a forward-declaration of a pattern to be defined later. | ||
:: This allows patterns to be a recursive/cyclical graph rather than acyclic. | ||
:fun non declare: PEG.Pattern.Forward(T).new | ||
|
||
:: Check for any Unicode codepoint, failing at end of input or invalid data. | ||
:fun non any: PEG.Pattern.UnicodeAny(T) | ||
|
||
:: Check for exact matches with the given literal string. | ||
:: As an optimization, returns `PEG.Pattern.Byte` if the string is one byte. | ||
:fun non str(text String) | ||
// If reading the first byte raises an error, the `try` absorbs it and | ||
// execution falls through to the general literal pattern below. | ||
try ( | ||
if (text.size == 1) ( | ||
pattern = PEG.Pattern.Byte(T).new(text.byte_at!(0)) | ||
return pattern // TODO: make these two lines a single line | ||
) | ||
) | ||
PEG.Pattern.Literal(T).new(text) | ||
|
||
:: Check for exact matches with the given Unicode codepoint. | ||
:: As an optimization, returns `PEG.Pattern.Byte` if the codepoint is | ||
:: less than 0x80, so that it can be represented as a single `U8` value. | ||
:fun non char(codepoint U32) | ||
if (codepoint < 0x80) ( | ||
return PEG.Pattern.Byte(T).new(codepoint.u8) | ||
) | ||
// Otherwise match a literal string built by interpolating the codepoint. | ||
PEG.Pattern.Literal(T).new("\(codepoint.format.unicode)") | ||
|
||
:: Check for a given range of Unicode codepoints, inclusive of both ends. | ||
:fun non range(min, max): PEG.Pattern.UnicodeRange(T).new(min, max) | ||
|
||
:: A trait of convenience methods/operators for composing patterns based | ||
:: on a given base pattern (the receiver of the method call). | ||
:: | ||
:: This DSL is meant for use in composing a `PEG.Grammar`. | ||
:trait PEG.DSL.Methods(T PEG.Token.Kind) | ||
// TODO: Remove the need for this duplication with `PEG.Pattern`, without | ||
// causing the compiler to enter an infinite recursion of trait copying. | ||
:fun description String | ||
:is IntoString | ||
:fun val check_match(source String, offset USize, state _State(T)) _Result(T) | ||
:fun val match(source String, offset USize, state _State(T)) _Result(T) | ||
// TODO: Remove the above lines ^ | ||
|
||
:: Chain the given pattern after this one, in a sequence. | ||
:: The result is a new two-element `PEG.Pattern.Sequence`. | ||
:fun ref ">>"(other): PEG.Pattern.Sequence(T).new([@, other]) | ||
|
||
:: Allow an ordered choice between this pattern and the given other one, | ||
:: instructing the parser to try to match the other one if this one fails. | ||
:: The result is a new two-element `PEG.Pattern.Choice`. | ||
:fun ref "/"(other): PEG.Pattern.Choice(T).new([@, other]) | ||
|
||
:: Use this pattern for "negative lookahead", checking what's ahead | ||
:: to ensure it doesn't match the pattern, without actually consuming it. | ||
:: Chain this method twice to get "positive lookahead", checking the opposite. | ||
:fun ref not: PEG.Pattern.Not(T).new(@) | ||
// TODO: :fun ref "~": PEG.Pattern.Not(T).new(@) | ||
|
||
:: Check for this pattern to repeat at least the given number of times, | ||
:: consuming it as many times as needed until it no longer matches. | ||
:: With the default `min` of zero, this accepts zero or more repetitions. | ||
:fun ref repeat(min USize = 0): PEG.Pattern.Repeat(T).new(@, min) | ||
|
||
:: Allow this pattern to be optional, avoiding failure if it doesn't match. | ||
:fun ref maybe: PEG.Pattern.Optional(T).new(@) | ||
|
||
:: Follow this pattern by checking to ensure that the end of input is reached. | ||
:fun ref then_eof: PEG.Pattern.EOF(T).new(@) | ||
|
||
:: When this pattern is matched, mark a token for the content it matched. | ||
:: The token is tagged with the given `kind`. | ||
:fun ref token(kind T): PEG.Pattern.Tokenize(T).new(@, kind) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// A grammar copies in the pattern-creating methods of `PEG.DSL` and exposes | ||
// a single `root` pattern, which is where `PEG.Parser` starts matching. | ||
:trait val PEG.Grammar(T PEG.Token.Kind) | ||
:copies PEG.DSL(T) | ||
|
||
// The top-level pattern of the grammar. | ||
:let root PEG.Pattern(T) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// A builder turns the tokens of a successful parse into an output of type `O`. | ||
:trait PEG.Parser.Builder(T PEG.Token.Kind, O Any'any) | ||
// Assemble the output value from the given tokens. | ||
:fun ref build(tokens Array(PEG.Token(T))'val) O | ||
// Called by `PEG.Parser` at the start of each parse; does nothing by default. | ||
:fun ref reset: None | ||
|
||
// A parser pairs a grammar with a builder: the grammar's root pattern is | ||
// matched against the source, and on success the builder turns the resulting | ||
// tokens into the output value. | ||
:class PEG.Parser(T PEG.Token.Kind, O Any'any) | ||
:let grammar PEG.Grammar(T) | ||
:let builder PEG.Parser.Builder(T, O) | ||
:new (@grammar, @builder) | ||
|
||
// The length reported by the match result of the most recent `parse!` call. | ||
:var last_parse_byte_size USize: 0 | ||
// The pattern reported as failing by the most recent `parse!` call, | ||
// or `None` if that call did not fail (or no parse has happened yet). | ||
:var last_parse_fail_pattern (PEG.Pattern(T)'val | None): None | ||
|
||
// Match the grammar's root pattern against `source`, starting at `offset`. | ||
// On success, returns whatever the builder builds from the matched tokens. | ||
// On failure, records the failing pattern and raises an error. | ||
:fun ref parse!(source, offset = 0) O | ||
@builder.reset | ||
|
||
// Clear any failure left over from an earlier call, so that this field | ||
// always describes the current parse rather than a stale one. | ||
@last_parse_fail_pattern = None | ||
|
||
result = @grammar.root.match(source, offset, _State(T).new) | ||
@last_parse_byte_size = result.length | ||
|
||
try ( | ||
@builder.build(result.success_tokens!) | ||
| | ||
@last_parse_fail_pattern = result.fail_pattern! | ||
error! | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
:: `PEG.Pattern.Byte` is used to consume a single specific byte. | ||
:: | ||
:: Parsing will fail if the next byte in the string doesn't match that byte. | ||
:: Otherwise, the pattern succeeds, consuming the single matched byte. | ||
:class PEG.Pattern.Byte(T PEG.Token.Kind) | ||
:is PEG.Pattern(T) | ||
|
||
// The byte value that must be found at the offset being checked. | ||
:let expected U8 | ||
:new (@expected) | ||
|
||
:fun description: "\(@expected.format.unicode)" | ||
|
||
:is IntoString | ||
// The space needed for the `char('')` wrapper plus the formatted byte. | ||
:fun into_string_space USize | ||
"char('')".size + @expected.format.printable_ascii.into_string_space | ||
// Prints this pattern in the same form used to create it: `char('x')`. | ||
:fun into_string(out String'iso) | ||
out << "char('" | ||
out = @expected.format.printable_ascii.into_string(--out) | ||
out << "')" | ||
--out | ||
|
||
:fun val check_match( | ||
source String | ||
offset USize | ||
state _State(T) | ||
) _Result(T) | ||
// Any error raised inside the `try` leads to the failure branch: either | ||
// the explicit `error!` on a mismatch, or one raised by `byte_at!` itself | ||
// (presumably when `offset` is out of range - TODO confirm). | ||
try ( | ||
error! if (source.byte_at!(offset) != @expected) | ||
_Result(T).success(1) | ||
| | ||
_Result(T).fail(0, @) | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
:: `PEG.Pattern.Choice` is used to specify an ordered choice of patterns. | ||
:: | ||
:: Returns the result of the first child pattern that matched. | ||
:: Returns the longest-length failure if all child patterns fail. | ||
:class PEG.Pattern.Choice(T PEG.Token.Kind) | ||
:is PEG.Pattern(T) | ||
|
||
// The alternatives, in the order in which they will be tried. | ||
:let children Array(PEG.Pattern(T)) | ||
:new (@children) | ||
|
||
:: Override this DSL operator to accrue into the existing choice. | ||
:: The new alternative is appended to a copy of the list of children, | ||
:: and a new choice is created from that copy. | ||
:fun "/"(other PEG.Pattern(T)) | ||
children ref = @children.clone | ||
children << other | ||
@new(--children) | ||
|
||
// Describe the choice as "either A, or B, or C". With fewer than two | ||
// children, use the only child's description, or "empty choice!" if none. | ||
:fun description | ||
if (@children.size < 2) ( | ||
try (@children[0]!.description | "empty choice!") | ||
| | ||
out = String.new_iso | ||
@children.each_with_index -> (child, index | | ||
if index.is_zero ( | ||
out << "either " | ||
out << child.description | ||
| | ||
out << ", or " | ||
out << child.description | ||
) | ||
) | ||
--out | ||
) | ||
|
||
:is IntoString | ||
// The space needed by `into_string`: the enclosing parentheses, one " / " | ||
// separator before every child except the first, and the children themselves. | ||
// The separators are counted per child rather than as `(size - 1) * 3`, | ||
// because that subtraction would wrap around below zero for an empty choice. | ||
:fun into_string_space USize | ||
space = "()".size | ||
@children.each_with_index -> (child, index | | ||
if index.is_nonzero (space += " / ".size) | ||
space += child.into_string_space | ||
) | ||
space | ||
// Prints the choice as its children joined by " / " inside parentheses. | ||
:fun into_string(out String'iso) String'iso | ||
out << "(" | ||
@children.each_with_index -> (child, index | | ||
if index.is_nonzero (out << " / ") | ||
out = child.into_string(--out) | ||
) | ||
out << ")" | ||
--out | ||
|
||
:fun val check_match( | ||
source String | ||
offset USize | ||
state _State(T) | ||
) _Result(T) | ||
// Start from a zero-length failure attributed to this choice itself; | ||
// this is what gets returned if no child fails at a greater length. | ||
fail_result = _Result(T).fail(0, @) | ||
|
||
// Try each child pattern in order, looking for the first successful match. | ||
@children.each -> (child | | ||
result = child.match(source, offset, state) | ||
|
||
// On first success, return the result. | ||
if result.is_success ( | ||
return result | ||
) | ||
|
||
// On failure, record the info if this is the longest failure yet seen. | ||
if (result.length > fail_result.length) ( | ||
fail_result = result | ||
) | ||
) | ||
|
||
// Every child failed: report the longest failure to the shared state, | ||
// falling back to this choice as the failing pattern if none is recorded. | ||
state.observe_fail( | ||
offset + fail_result.length | ||
try (fail_result.fail_pattern! | @) | ||
) | ||
fail_result
Oops, something went wrong.