Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON5 Support #152

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
27b5cd8
init json5-parser
gorm-issuu Aug 10, 2020
92e2615
add numbers
gorm-issuu Aug 10, 2020
912cea1
not sure what is here
gorm-issuu Oct 29, 2020
d7cf1c6
Add sedlex
gorm-issuu Oct 29, 2020
7cb17ba
add support for comments
gorm-issuu Oct 29, 2020
b1dc1bf
Add ppx_deriving dependency
gertsonderby Oct 30, 2020
3b56fd6
Test file for JSON5 lexing
gertsonderby Oct 30, 2020
03eefbd
Basic strings, escape seqs
gertsonderby Oct 30, 2020
a5931f9
skeleton of parser
gorm-issuu Nov 5, 2020
5a12eb2
implement parsing of lists
gorm-issuu Nov 5, 2020
08fd847
implement parsing of objects
gorm-issuu Nov 6, 2020
23d62ae
Add JSON5 code
dhilst Sep 25, 2022
f55e517
Add unicode, hex, octal, \n and friends escaping support
dhilst Sep 25, 2022
66dc605
Add more tests for JSON5
dhilst Sep 11, 2022
59659e2
Add more tests for JSON5
dhilst Sep 11, 2022
01e30dd
Update Changelog
Leonidas-from-XIV Apr 26, 2024
3d40870
Extend tests and split them into their own test cases
Leonidas-from-XIV Apr 26, 2024
afcb738
Fix issue with quoting
Leonidas-from-XIV Apr 26, 2024
6514db3
Simplify the code with `Result.map` operators
Leonidas-from-XIV Apr 26, 2024
e976079
Add test cases for trailing commas
Leonidas-from-XIV Apr 26, 2024
fd07bd7
Unify all tests to use `Alcotest.result` to compare
Leonidas-from-XIV Apr 26, 2024
dbe0b81
Fix handling of multiple trailing commas
Leonidas-from-XIV May 30, 2024
5a81430
More tests and better argument order
Leonidas-from-XIV May 30, 2024
1df1df5
Do not depend on `fmt`
Leonidas-from-XIV May 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### Added

- Added support for JSON5 (@dhilst, @gorm-issuu, @gertsonderby, #152)

### Changed

### Deprecated
Expand Down
11 changes: 11 additions & 0 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,14 @@ meant for developers that are worried about performance changes in Yojson.")
(core (>= v0.14.0))
(core_unix (>= v0.14.0))
(sexplib (>= v0.9.0))))

; Package metadata for the JSON5 reader that lives in lib/json5.
(package
 (name yojson-json5)
 (synopsis "Yojson_json5 is a parsing and printing library for the JSON5 format")
 (description "Yojson_json5 is a parsing and printing library for the JSON5 format.
It supports parsing JSON5 to Yojson.Basic.t and Yojson.Safe.t types.")
 (depends
  (ocaml (>= 4.08))
  ; lib/json5/dune links against yojson, so the package has to declare it;
  ; pinning to the same version keeps the two packages released in lockstep.
  (yojson (= :version))
  (sedlex (>= 2.5))
  (alcotest (and :with-test (>= 0.8.5)))))

27 changes: 27 additions & 0 deletions lib/json5/ast.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
(** Intermediate JSON5 syntax tree.

    Numeric literals keep their source text ([IntLit], [FloatLit]) so that
    each conversion below can decide how to interpret them. *)
type t =
  | Assoc of (string * t) list
  | List of t list
  | StringLit of string
  | IntLit of string
  | FloatLit of string
  | Bool of bool
  | Null

(** [to_basic ast] converts [ast] into polymorphic variants using the
    constructors [`Assoc], [`List], [`String], [`Float], [`Int], [`Bool] and
    [`Null].

    @raise Failure
      if an [IntLit] payload is rejected by [int_of_string] (for example
      because it does not fit in an [int]) or a [FloatLit] payload is rejected
      by [float_of_string]. *)
let rec to_basic ast =
  match ast with
  | Null -> `Null
  | Bool flag -> `Bool flag
  | IntLit digits -> `Int (int_of_string digits)
  | FloatLit digits -> `Float (float_of_string digits)
  | StringLit text -> `String text
  | List items -> `List (List.map to_basic items)
  | Assoc fields ->
      let convert_field (key, value) = (key, to_basic value) in
      `Assoc (List.map convert_field fields)

(** [to_safe ast] is like {!to_basic}, except that an [IntLit] whose text
    cannot be read by [int_of_string_opt] is preserved verbatim as
    [`Intlit text] instead of raising.

    @raise Failure
      if a [FloatLit] payload is rejected by [float_of_string]. *)
let rec to_safe ast =
  match ast with
  | Null -> `Null
  | Bool flag -> `Bool flag
  | IntLit digits -> (
      match int_of_string_opt digits with
      | None -> `Intlit digits
      | Some n -> `Int n)
  | FloatLit digits -> `Float (float_of_string digits)
  | StringLit text -> `String text
  | List items -> `List (List.map to_safe items)
  | Assoc fields ->
      let convert_field (key, value) = (key, to_safe value) in
      `Assoc (List.map convert_field fields)
7 changes: 7 additions & 0 deletions lib/json5/basic.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
(* Re-export everything from [Yojson.Basic], so this module can be used
   wherever the plain JSON module is expected. *)
include Yojson.Basic

(* Add whatever [Read.Make] produces for this instantiation. [Read] is defined
   in another file; presumably it provides the JSON5 reading functions
   (NOTE(review): confirm against read.ml). The functor argument fixes the
   result type to [Yojson.Basic.t] and converts the parsed [Ast.t] with
   [Ast.to_basic]. *)
include Read.Make (struct
type t = Yojson.Basic.t

let convert = Ast.to_basic
end)
6 changes: 6 additions & 0 deletions lib/json5/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; JSON5 library: the OCaml modules in this directory are wrapped as
; Yojson_json5 and published under the public name yojson-json5.
(library
(name yojson_json5)
(public_name yojson-json5)
; yojson supplies the Yojson.Basic types re-exported by basic.ml; sedlex is
; the runtime used by the lexer.
(libraries yojson sedlex)
; The sedlex ppx expands the sedlex extension nodes used in lexer.ml.
(preprocess
(pps sedlex.ppx)))
4 changes: 4 additions & 0 deletions lib/json5/let_syntax.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(** Binding operators for values of type [result]. Open this module to write
    [let* x = r in ...] and [let+ x = r in ...]. *)
module Result = struct
  (** [let* x = r in body] runs [body] on the payload of [Ok] and propagates
      [Error] unchanged; [body] itself returns a [result]. *)
  let ( let* ) r f = Result.bind r f

  (** [let+ x = r in body] maps [body] over the payload of [Ok] and propagates
      [Error] unchanged. *)
  let ( let+ ) r f = Result.map f r
end
251 changes: 251 additions & 0 deletions lib/json5/lexer.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
open Let_syntax.Result

(** Tokens handed from the lexer to the parser. Constructors that carry a
    [string] hold text taken from the input. *)
type token =
  | OPEN_PAREN
  | CLOSE_PAREN
  | OPEN_BRACE
  | CLOSE_BRACE
  | OPEN_BRACKET
  | CLOSE_BRACKET
  | COLON
  | COMMA
  | COMMENT of string
  | TRUE
  | FALSE
  | NULL
  | FLOAT of string
  | INT_OR_FLOAT of string
  | INT of string
  | STRING of string
  | IDENTIFIER_NAME of string

(** [pp_token ppf token] prints a short human-readable rendering of [token]
    on [ppf]: punctuation and keywords appear as their quoted source text,
    payload-carrying tokens as the constructor name followed by the quoted
    payload. *)
let pp_token ppf token =
  (* Fixed spellings contain no conversions, so they are printed directly. *)
  let fixed text = Format.pp_print_string ppf text in
  match token with
  | OPEN_PAREN -> fixed "'('"
  | CLOSE_PAREN -> fixed "')'"
  | OPEN_BRACE -> fixed "'{'"
  | CLOSE_BRACE -> fixed "'}'"
  | OPEN_BRACKET -> fixed "'['"
  | CLOSE_BRACKET -> fixed "']'"
  | COLON -> fixed "':'"
  | COMMA -> fixed "','"
  | TRUE -> fixed "'true'"
  | FALSE -> fixed "'false'"
  | NULL -> fixed "'null'"
  | COMMENT payload -> Format.fprintf ppf "COMMENT '%s'" payload
  | FLOAT payload -> Format.fprintf ppf "FLOAT '%s'" payload
  | INT_OR_FLOAT payload -> Format.fprintf ppf "INT_OR_FLOAT '%s'" payload
  | INT payload -> Format.fprintf ppf "INT '%s'" payload
  | STRING payload -> Format.fprintf ppf "STRING '%s'" payload
  | IDENTIFIER_NAME payload ->
      Format.fprintf ppf "IDENTIFIER_NAME '%s'" payload

(* Any single code point. *)
let source_character = [%sedlex.regexp? any]
(* One line-terminator code point: LF, CR, U+2028 (line separator) or U+2029
   (paragraph separator). *)
let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029]

(* A complete line break: like [line_terminator], but CR followed by LF is
   consumed as one unit. *)
let line_terminator_sequence =
[%sedlex.regexp? 0x000A | 0x000D, Opt 0x000A | 0x2028 | 0x2029]

(* NUMBERS, 7.8.3 *)
(* Building blocks shared by the numeric token regexps further down.
   NOTE(review): the section number presumably refers to the ECMAScript 5.1
   specification - confirm. *)
let non_zero_digit = [%sedlex.regexp? '1' .. '9']
let decimal_digit = [%sedlex.regexp? '0' .. '9']
(* One or more decimal digits; leading zeros are accepted here. *)
let decimal_digits = [%sedlex.regexp? Plus decimal_digit]
let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F']
let exponent_indicator = [%sedlex.regexp? 'e' | 'E']

(* Decimal digits with an optional leading sign; used for the exponent. *)
let signed_integer =
[%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits]

(* For example e10, E+3 or e-7. *)
let exponent_part = [%sedlex.regexp? exponent_indicator, signed_integer]

(* Either a lone 0, or a digit sequence that does not start with 0. *)
let decimal_integer_literal =
[%sedlex.regexp? '0' | non_zero_digit, Opt decimal_digits]

(* 0x or 0X followed by at least one hexadecimal digit. *)
let hex_integer_literal =
[%sedlex.regexp? "0x", Plus hex_digit | "0X", Plus hex_digit]

(* float *)
(* An unsigned literal that contains a decimal point. Two shapes are accepted:
   an integer part, the point and optional fraction digits (so a trailing
   point such as 1. is allowed), or a leading point followed by mandatory
   fraction digits (such as .5). Both may carry an exponent. *)
let float_literal =
[%sedlex.regexp?
( decimal_integer_literal, '.', Opt decimal_digits, Opt exponent_part
| '.', decimal_digits, Opt exponent_part )]

(* [float_literal] with an optional leading + or - sign. *)
let json5_float =
[%sedlex.regexp? float_literal | '+', float_literal | '-', float_literal]

(* int_or_float *)
(* An unsigned decimal integer literal with an optional exponent and no
   decimal point, for example 12 or 1e5. *)
let int_or_float_literal =
[%sedlex.regexp? decimal_integer_literal, Opt exponent_part]

(* [int_or_float_literal] with an optional leading + or - sign. *)
let json5_int_or_float =
[%sedlex.regexp?
int_or_float_literal | '+', int_or_float_literal | '-', int_or_float_literal]

(* int/hex *)
(* Decimal digits with an optional sign. Unlike [decimal_integer_literal]
   this accepts leading zeros, because it is built on [decimal_digits]. *)
let int_literal =
[%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits]

(* An integer token: a hexadecimal literal with an optional sign, or a
   (possibly signed) decimal [int_literal]. *)
let json5_int =
[%sedlex.regexp?
( hex_integer_literal
| '+', hex_integer_literal
| '-', hex_integer_literal
| int_literal )]

(* STRING LITERALS, 7.8.4 *)
(* All regexps in this group describe what follows the backslash of an escape;
   the backslash itself is matched by the users of [escape_sequence], except
   in [line_continuation], which includes it. *)

(* The letter u followed by exactly four hexadecimal digits. *)
let unicode_escape_sequence =
[%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit]

(* Characters with a dedicated one-character escape: single quote, double
   quote, backslash, and the letters b f n r t v. *)
let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}]

(* Every character that has a special meaning directly after a backslash. *)
let escape_character =
[%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u']

(* Any character that is neither an [escape_character] nor a line
   terminator. *)
let non_escape_character =
[%sedlex.regexp? Sub (source_character, (escape_character | line_terminator))]

let character_escape_sequence =
[%sedlex.regexp? single_escape_character | non_escape_character]

(* A backslash immediately followed by a line break. *)
let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence]
(* The letter x followed by exactly two hexadecimal digits. *)
let hex_escape_sequence = [%sedlex.regexp? 'x', hex_digit, hex_digit]

(* Everything accepted after a backslash: a character escape, a 0 optionally
   followed by exactly two more decimal digits, a hex escape or a unicode
   escape. *)
let escape_sequence =
[%sedlex.regexp?
( character_escape_sequence
| '0', Opt (decimal_digit, decimal_digit)
| hex_escape_sequence | unicode_escape_sequence )]

(* NOTE(review): the three regexps below do not appear to be referenced by
   the lexing functions later in this file, which scan string bodies with
   [string_lex_single] and [string_lex_double] instead. *)

(* One element of a single-quoted string body: an ordinary character (not a
   single quote, backslash or line terminator), a backslash escape, or a line
   continuation. *)
let single_string_character =
[%sedlex.regexp?
( Sub (source_character, ('\'' | '\\' | line_terminator))
| '\\', escape_sequence
| line_continuation )]

(* Same as [single_string_character], for double-quoted strings. *)
let double_string_character =
[%sedlex.regexp?
( Sub (source_character, ('"' | '\\' | line_terminator))
| '\\', escape_sequence
| line_continuation )]

(* A complete string literal including its delimiters, in either quoting
   style. *)
let string_literal =
[%sedlex.regexp?
( '"', Star double_string_character, '"'
| '\'', Star single_string_character, '\'' )]

(* IDENTIFIER_NAME (keys in objects) *)
(* The two-letter names used below (mn, mc, nd, pc, lu, ll, lt, lm, lo, nl)
   are character classes supplied by sedlex; they are presumably the Unicode
   general categories of the same names (NOTE(review): confirm against the
   sedlex documentation). *)
let unicode_combining_mark = [%sedlex.regexp? mn | mc]
let unicode_digit = [%sedlex.regexp? nd]
let unicode_connector_punctuation = [%sedlex.regexp? pc]
let unicode_letter = [%sedlex.regexp? lu | ll | lt | lm | lo | nl]
(* U+200C and U+200D (zero-width non-joiner and zero-width joiner). *)
let zwnj = [%sedlex.regexp? 0x200C]
let zwj = [%sedlex.regexp? 0x200D]

(* First character of an identifier: a letter, a dollar sign, an underscore,
   or a backslash followed by a unicode escape. *)
let identifier_start =
[%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unicode_escape_sequence]

(* Any later character of an identifier: everything allowed at the start plus
   combining marks, digits, connector punctuation and the two zero-width
   joiners. *)
let identifier_part =
[%sedlex.regexp?
( identifier_start | unicode_combining_mark | unicode_digit
| unicode_connector_punctuation | zwnj | zwj )]

let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part]

(* COMMENTS 7.4 *)

(* Any character allowed inside a // comment: everything except a line
   terminator, so the comment stops before the line break. *)
let single_line_comment_char =
  [%sedlex.regexp? Sub (source_character, line_terminator)]

let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char]
let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')]

(* Kept so that existing references keep compiling; no longer used by
   [multi_line_comment_char]. *)
let multi_line_not_slash_char = [%sedlex.regexp? Sub (source_character, '/')]

(* One unit of a block-comment body: either a character that is not an
   asterisk, or a run of asterisks followed by a character that is neither an
   asterisk nor a slash.

   The run of asterisks must not be allowed to swallow an asterisk that is
   directly followed by a slash. The previous definition (an asterisk followed
   by one or more non-slash characters) did allow that, and because the
   lexer takes the longest match, an input with two block comments where the
   first one contains an inner asterisk was lexed as a single comment,
   silently dropping everything between them. *)
let multi_line_comment_char =
  [%sedlex.regexp?
    ( multi_line_not_asterisk_char
    | Plus '*', Sub (source_character, ('*' | '/')) )]

(* A block comment: the opener, any number of body units, then one or more
   asterisks and the closing slash. With the body defined as above the match
   always ends at the first closing delimiter. *)
let multi_line_comment =
  [%sedlex.regexp? "/*", Star multi_line_comment_char, Plus '*', '/']

(* Either comment form. *)
let comment = [%sedlex.regexp? multi_line_comment | single_line_comment]

(* Horizontal white space: tab, vertical tab, form feed, space, no-break
   space, U+FEFF (byte order mark), plus the sedlex class zs (presumably the
   Unicode space-separator category - NOTE(review): confirm). Line
   terminators are not included here; see [line_terminator]. *)
let white_space =
[%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs]

(* [string_lex_single lexbuf strbuf] reads the body of a single-quoted
   string. The opening quote must already have been consumed. Decoded text is
   appended to [strbuf]; when the closing quote is reached the buffer contents
   are returned as [Ok]. An escape rejected by [Unescape.unescape], or input
   that matches none of the cases, yields [Error]. *)
let string_lex_single lexbuf strbuf =
  let rec scan () =
    match%sedlex lexbuf with
    | '\'' -> Ok (Buffer.contents strbuf)
    | '\\', escape_sequence -> (
        (* The whole escape, backslash included, is handed to the decoder. *)
        match Unescape.unescape (Sedlexing.Utf8.lexeme lexbuf) with
        | Error message -> Error message
        | Ok decoded ->
            Buffer.add_string strbuf decoded;
            scan ())
    | line_continuation ->
        (* A backslash followed by a line break contributes nothing. *)
        scan ()
    | Sub (source_character, ('\'' | line_terminator)) ->
        Buffer.add_string strbuf (Sedlexing.Utf8.lexeme lexbuf);
        scan ()
    | _ ->
        Error
          (Format.sprintf "Unexpected character: %s"
             (Sedlexing.Utf8.lexeme lexbuf))
  in
  scan ()

(* [string_lex_double lexbuf strbuf] reads the body of a double-quoted
   string. The opening quote must already have been consumed. Decoded text is
   appended to [strbuf]; when the closing quote is reached the buffer contents
   are returned as [Ok]. An escape rejected by [Unescape.unescape], or input
   that matches none of the cases, yields [Error]. *)
let string_lex_double lexbuf strbuf =
  let rec scan () =
    match%sedlex lexbuf with
    | '"' -> Ok (Buffer.contents strbuf)
    | '\\', escape_sequence -> (
        (* The whole escape, backslash included, is handed to the decoder. *)
        match Unescape.unescape (Sedlexing.Utf8.lexeme lexbuf) with
        | Error message -> Error message
        | Ok decoded ->
            Buffer.add_string strbuf decoded;
            scan ())
    | line_continuation ->
        (* A backslash followed by a line break contributes nothing. *)
        scan ()
    | Sub (source_character, ('"' | line_terminator)) ->
        Buffer.add_string strbuf (Sedlexing.Utf8.lexeme lexbuf);
        scan ()
    | _ ->
        Error
          (Format.sprintf "Unexpected character: %s"
             (Sedlexing.Utf8.lexeme lexbuf))
  in
  scan ()

(* [string_lex lexbuf quote] reads a string body from [lexbuf], choosing the
   scanner that matches the opening delimiter [quote] (the text of the quote
   character that was just consumed). Any other [quote] is an [Error]. *)
let string_lex lexbuf quote =
  let strbuf = Buffer.create 200 in
  match quote with
  | "'" -> string_lex_single lexbuf strbuf
  | {|"|} -> string_lex_double lexbuf strbuf
  | _ -> Error (Format.sprintf "Invalid string quote %S" quote)

(* [lex tokens buf] tokenises the rest of [buf]. [tokens] accumulates the
   tokens seen so far in reverse order; pass [[]] to start. At end of input
   the accumulated tokens are returned in source order as [Ok]. Comments,
   white space and line terminators are skipped, so no [COMMENT] token is
   produced here. Input that matches no rule, or a string body that fails to
   lex, yields [Error]. *)
let rec lex tokens buf =
  let current () = Sedlexing.Utf8.lexeme buf in
  (* Record one more token and carry on with the remaining input. *)
  let continue_with token = lex (token :: tokens) buf in
  match%sedlex buf with
  | '(' -> continue_with OPEN_PAREN
  | ')' -> continue_with CLOSE_PAREN
  | '{' -> continue_with OPEN_BRACE
  | '}' -> continue_with CLOSE_BRACE
  | '[' -> continue_with OPEN_BRACKET
  | ']' -> continue_with CLOSE_BRACKET
  | ':' -> continue_with COLON
  | ',' -> continue_with COMMA
  | Chars {|"'|} -> (
      (* The matched quote character selects the string scanner. *)
      match string_lex buf (current ()) with
      | Ok contents -> continue_with (STRING contents)
      | Error message -> Error message)
  | multi_line_comment | single_line_comment | white_space | line_terminator ->
      lex tokens buf
  | "true" -> continue_with TRUE
  | "false" -> continue_with FALSE
  | "null" -> continue_with NULL
  | json5_float -> continue_with (FLOAT (current ()))
  | json5_int -> continue_with (INT (current ()))
  | json5_int_or_float -> continue_with (INT_OR_FLOAT (current ()))
  | identifier_name -> continue_with (IDENTIFIER_NAME (current ()))
  | eof -> Ok (List.rev tokens)
  | _ -> Error (Format.asprintf "Unexpected character: '%s'" (current ()))
Loading
Loading