Skip to content

Commit

Permalink
Add unicode and hex escaping support
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Hilst committed Sep 4, 2022
1 parent b10e621 commit f824843
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 7 deletions.
71 changes: 64 additions & 7 deletions lib/json5/lexer.ml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
open Let_syntax.Result

type token =
| OPEN_PAREN
| CLOSE_PAREN
Expand Down Expand Up @@ -92,7 +94,7 @@ let json5_int =
let unicode_escape_sequence =
[%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit]

let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}]
let single_escape_character = [%sedlex.regexp? Chars {|'"\bfnrtv|}]
let escape_character =
[%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u']
Expand All @@ -104,9 +106,12 @@ let character_escape_sequence =
[%sedlex.regexp? single_escape_character | non_escape_character]
let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence]
let hex_escape_sequence = [%sedlex.regexp? 'x', hex_digit, hex_digit]
let escape_sequence =
[%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence]
[%sedlex.regexp?
( character_escape_sequence | '0' | hex_escape_sequence
| unicode_escape_sequence )]
let single_string_character =
[%sedlex.regexp?
Expand Down Expand Up @@ -163,8 +168,60 @@ let comment = [%sedlex.regexp? multi_line_comment | single_line_comment]
let white_space =
[%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs]

let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
fun tokens buf ->
let string_lex_single lexbuf strbuf =
Buffer.add_char strbuf '\'';
let lexeme = Sedlexing.Utf8.lexeme in
let rec lex lexbuf strbuf =
match%sedlex lexbuf with
| '\'' ->
Buffer.add_char strbuf '\'';
Ok (Buffer.contents strbuf)
| '\\', escape_sequence ->
let* s = Unescape.unescape (lexeme lexbuf) in
Buffer.add_string strbuf s;
lex lexbuf strbuf
| Sub (source_character, '\'') ->
Buffer.add_string strbuf (lexeme lexbuf);
lex lexbuf strbuf
| _ ->
lexeme lexbuf
|> Format.asprintf "Unexpected character: %s"
|> Result.error
in
lex lexbuf strbuf

let string_lex_double lexbuf strbuf =
Buffer.add_char strbuf '"';
let lexeme = Sedlexing.Utf8.lexeme in
let rec lex lexbuf strbuf =
match%sedlex lexbuf with
| '"' ->
Buffer.add_char strbuf '"';
Ok (Buffer.contents strbuf)
| '\\', escape_sequence ->
let* s = Unescape.unescape (lexeme lexbuf) in
Buffer.add_string strbuf s;
lex lexbuf strbuf
| Sub (source_character, '"') ->
Buffer.add_string strbuf (lexeme lexbuf);
lex lexbuf strbuf
| _ ->
lexeme lexbuf
|> Format.asprintf "Unexpected character: %s"
|> Result.error
in
lex lexbuf strbuf

let string_lex lexbuf quote =
let strbuf = Buffer.create 200 in
(* I would like to abstract the quote but I couldn't create
sedlex regepxs at runtime *)
if quote = "'" then string_lex_single lexbuf strbuf
else if quote = "\"" then string_lex_double lexbuf strbuf
else Error (Format.sprintf "invalid string quote %s" quote)
(* should never happen *)

let rec lex tokens buf =
let lexeme = Sedlexing.Utf8.lexeme in
match%sedlex buf with
| '(' -> lex (OPEN_PAREN :: tokens) buf
Expand All @@ -175,6 +232,9 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
| ']' -> lex (CLOSE_BRACKET :: tokens) buf
| ':' -> lex (COLON :: tokens) buf
| ',' -> lex (COMMA :: tokens) buf
| Chars {|"'|} ->
let* s = string_lex buf (lexeme buf) in
lex (STRING s :: tokens) buf
| multi_line_comment | single_line_comment | white_space | line_terminator ->
lex tokens buf
| "true" -> lex (TRUE :: tokens) buf
Expand All @@ -192,9 +252,6 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
| identifier_name ->
let s = lexeme buf in
lex (IDENTIFIER_NAME s :: tokens) buf
| string_literal ->
let s = lexeme buf in
lex (STRING s :: tokens) buf
| eof -> Ok (List.rev tokens)
| _ ->
lexeme buf |> Format.asprintf "Unexpected character: '%s'" |> Result.error
72 changes: 72 additions & 0 deletions lib/json5/unescape.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
open Let_syntax.Result

let ( % ) = Int.logor
let ( << ) = Int.shift_left
let ( >> ) = Int.shift_right
let ( & ) = Int.logand

let utf_8_string_of_unicode i =
if i <= 0x007F then (
let b = Bytes.create 1 in
Bytes.set_int8 b 0 i;
Ok (Bytes.to_string b))
else if i <= 0x07FF then (
let five_high_bits = i >> 6 & 0b11111 in
let six_low_bits = i & 0b111111 in
let high = 0b11000000 % five_high_bits << 8 in
let low = 0b10000000 % six_low_bits in
let n = high % low in
let b = Bytes.create 2 in
Bytes.set_int16_be b 0 n;
Ok (Bytes.to_string b))
else if i <= 0xFFFF then (
let four_high_bits = i >> 12 & 0b1111 in
let six_mid_bits = i >> 6 & 0b111111 in
let six_low_bits = i & 0b111111 in
let high = 0b11100000 % four_high_bits << 16 in
let mid = 0b10000000 % six_mid_bits << 8 in
let low = 0b10000000 % six_low_bits in
let n = high % mid % low in
let b = Bytes.create 3 in
Bytes.set_int32_be b 0 (Int32.of_int n);
Ok (Bytes.to_string b))
else if i <= 0x10FFFF then (
let three_hh_bits = i >> 18 & 0b111 in
let six_hl_bits = i >> 12 & 0b111111 in
let six_lh_bits = i >> 6 & 0b111111 in
let six_ll_bits = i & 0b111111 in
let hh = 0b11110000 % three_hh_bits << 24 in
let hl = 0b10000000 % six_hl_bits << 16 in
let lh = 0b10000000 % six_lh_bits << 8 in
let ll = 0b10000000 % six_ll_bits in
let n = hh % hl % lh % ll in
let b = Bytes.create 4 in
Bytes.set_int32_be b 0 (Int32.of_int n);
Ok (Bytes.to_string b))
else Error (Format.sprintf "invalid code point %X" i)

let unescape str =
if String.length str < 2 then
Error (Format.sprintf "too small escape sequence %s" str)
else
match str.[1] with
| 'u' ->
let escape_chars = String.sub str 2 4 in
let* as_int =
Format.sprintf "0x%s" escape_chars |> int_of_string_opt |> function
| Some x -> Ok x
| None ->
Error (Format.sprintf "bad escape sequence %s" escape_chars)
in
utf_8_string_of_unicode as_int
| 'x' ->
let escape_chars = String.sub str 2 2 in
let* as_int =
Format.sprintf "0x%s" escape_chars |> int_of_string_opt |> function
| Some x -> Ok x
| None ->
Error (Format.sprintf "bad escape sequence %s" escape_chars)
in
utf_8_string_of_unicode as_int
| '\\' | '"' | 'n' | 't' -> Ok str
| _ -> Error (Format.sprintf "invalid escape sequence %c" str.[1])
8 changes: 8 additions & 0 deletions test_json5/test.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ let test_from_string () =
"float" (`Float 12345.67890)
(M.from_string "12345.67890");
Alcotest.(check yojson_json5) "hex" (`Int 0x1) (M.from_string "0x1");
Alcotest.(check yojson_json5)
"hex escape sequence" (`String "a") (M.from_string {|"\x61"|});
Alcotest.(check yojson_json5)
"unicode escape sequence" (`String "λ")
(M.from_string {|"\u03bb"|});
Alcotest.(check yojson_json5)
"more string escaping" (`String "Hello λ world")
(M.from_string "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\"");
Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1");
Alcotest.(check yojson_json5)
"line break" (`String "foo\\\nbar")
Expand Down

0 comments on commit f824843

Please sign in to comment.