From 6e623ce55628f4afdd745b1c5482ccb333a63558 Mon Sep 17 00:00:00 2001
From: Daniel Hilst
Date: Sun, 25 Sep 2022 09:42:08 -0300
Subject: [PATCH] Add unicode, hex, octal, \n and friends escaping support

---
Review notes (text between the "---" line and the first diff is not part of
the commit message):

- Unescape.utf_8_string_of_unicode now encodes through
  Buffer.add_utf_8_uchar. The hand-written encoder wrote four bytes with
  Bytes.set_int32_be into a three-byte buffer for U+0800..U+FFFF, which
  raises Invalid_argument, so escapes such as \u20ac could not be lexed.
- Unescape.unescape no longer rejects escapes that the lexer accepts
  (\r, \b, \f, \v, \', \/ and friends); like \n and \t they are kept
  verbatim. Whether they should be decoded here is left as a NOTE(review).
- Unescape.unescape checks the length of its argument before slicing it.
- Indentation of the context lines was reconstructed assuming ocamlformat's
  default style; use "git apply --ignore-whitespace" if it does not match.
  The blob ids on the "index" lines predate these changes.

 lib/json5/lexer.ml    | 69 +++++++++++++++++++++++++++++++++++----
 lib/json5/unescape.ml | 63 ++++++++++++++++++++++++++++++++++++
 test_json5/test.ml    | 17 ++++++++++
 3 files changed, 142 insertions(+), 7 deletions(-)
 create mode 100644 lib/json5/unescape.ml

diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml
index ebadfd7b..611ac09a 100644
--- a/lib/json5/lexer.ml
+++ b/lib/json5/lexer.ml
@@ -1,3 +1,5 @@
+open Let_syntax.Result
+
 type token =
   | OPEN_PAREN
   | CLOSE_PAREN
@@ -92,7 +94,7 @@ let json5_int =
 let unicode_escape_sequence =
   [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit]
 
-let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}]
+let single_escape_character = [%sedlex.regexp? Chars {|'"\bfnrtv|}]
 
 let escape_character =
   [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u']
@@ -104,9 +106,13 @@ let character_escape_sequence =
   [%sedlex.regexp? single_escape_character | non_escape_character]
 
 let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence]
+let hex_escape_sequence = [%sedlex.regexp? 'x', hex_digit, hex_digit]
 
 let escape_sequence =
-  [%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence]
+  [%sedlex.regexp?
+    ( character_escape_sequence
+    | '0', Opt (decimal_digit, decimal_digit)
+    | hex_escape_sequence | unicode_escape_sequence )]
 
 let single_string_character =
   [%sedlex.regexp?
@@ -163,8 +169,57 @@ let comment = [%sedlex.regexp? multi_line_comment | single_line_comment]
 let white_space =
   [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs]
 
-let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
- fun tokens buf ->
+let string_lex_single lexbuf strbuf =
+  Buffer.add_char strbuf '\'';
+  let lexeme = Sedlexing.Utf8.lexeme in
+  let rec lex lexbuf strbuf =
+    match%sedlex lexbuf with
+    | '\'' ->
+        Buffer.add_char strbuf '\'';
+        Ok (Buffer.contents strbuf)
+    | '\\', escape_sequence ->
+        let* s = Unescape.unescape (lexeme lexbuf) in
+        Buffer.add_string strbuf s;
+        lex lexbuf strbuf
+    | Sub (source_character, '\'') ->
+        Buffer.add_string strbuf (lexeme lexbuf);
+        lex lexbuf strbuf
+    | _ ->
+        lexeme lexbuf
+        |> Format.asprintf "Unexpected character: %s"
+        |> Result.error
+  in
+  lex lexbuf strbuf
+
+let string_lex_double lexbuf strbuf =
+  Buffer.add_char strbuf '"';
+  let lexeme = Sedlexing.Utf8.lexeme in
+  let rec lex lexbuf strbuf =
+    match%sedlex lexbuf with
+    | '"' ->
+        Buffer.add_char strbuf '"';
+        Ok (Buffer.contents strbuf)
+    | '\\', escape_sequence ->
+        let* s = Unescape.unescape (lexeme lexbuf) in
+        Buffer.add_string strbuf s;
+        lex lexbuf strbuf
+    | Sub (source_character, '"') ->
+        Buffer.add_string strbuf (lexeme lexbuf);
+        lex lexbuf strbuf
+    | _ ->
+        lexeme lexbuf
+        |> Format.asprintf "Unexpected character: %s"
+        |> Result.error
+  in
+  lex lexbuf strbuf
+
+let string_lex lexbuf quote =
+  let strbuf = Buffer.create 200 in
+  if quote = "'" then string_lex_single lexbuf strbuf
+  else if quote = "\"" then string_lex_double lexbuf strbuf
+  else Error (Format.sprintf "invalid string quote %s" quote)
+
+let rec lex tokens buf =
   let lexeme = Sedlexing.Utf8.lexeme in
   match%sedlex buf with
   | '(' -> lex (OPEN_PAREN :: tokens) buf
@@ -175,6 +230,9 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
   | ']' -> lex (CLOSE_BRACKET :: tokens) buf
   | ':' -> lex (COLON :: tokens) buf
   | ',' -> lex (COMMA :: tokens) buf
+  | Chars {|"'|} ->
+      let* s = string_lex buf (lexeme buf) in
+      lex (STRING s :: tokens) buf
   | multi_line_comment | single_line_comment | white_space | line_terminator ->
       lex tokens buf
   | "true" -> lex (TRUE :: tokens) buf
@@ -192,9 +250,6 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result =
   | identifier_name ->
       let s = lexeme buf in
       lex (IDENTIFIER_NAME s :: tokens) buf
-  | string_literal ->
-      let s = lexeme buf in
-      lex (STRING s :: tokens) buf
   | eof -> Ok (List.rev tokens)
   | _ ->
       lexeme buf |> Format.asprintf "Unexpected character: '%s'" |> Result.error
diff --git a/lib/json5/unescape.ml b/lib/json5/unescape.ml
new file mode 100644
index 00000000..63e571b0
--- /dev/null
+++ b/lib/json5/unescape.ml
@@ -0,0 +1,63 @@
+open Let_syntax.Result
+
+(** [utf_8_string_of_unicode i] is the UTF-8 encoding of the Unicode scalar
+    value [i].
+
+    Returns [Error _] when [i] is not a scalar value: negative, a surrogate
+    ([0xD800]-[0xDFFF]) or greater than [0x10FFFF]. *)
+let utf_8_string_of_unicode i =
+  if Uchar.is_valid i then (
+    (* Encode through the standard library: every sequence length is
+       handled by the same call, with no byte-level bookkeeping here. *)
+    let buf = Buffer.create 4 in
+    Buffer.add_utf_8_uchar buf (Uchar.of_int i);
+    Ok (Buffer.contents buf))
+  else Error (Format.sprintf "invalid code point %X" i)
+
+(* [numeric_escape ~prefix ~len str] decodes the [len] digits that follow the
+   two-character introducer of the escape sequence [str]. [prefix] selects
+   the base the digits are read in: ["0x"] for hexadecimal, ["0o"] for
+   octal. *)
+let numeric_escape ~prefix ~len str =
+  if String.length str < 2 + len then
+    Error (Format.sprintf "bad escape sequence %s" str)
+  else
+    let digits = String.sub str 2 len in
+    let* code =
+      int_of_string_opt (prefix ^ digits)
+      |> Option.to_result
+           ~none:(Format.sprintf "bad escape sequence %s" digits)
+    in
+    utf_8_string_of_unicode code
+
+(** [unescape str] translates the escape sequence [str], which starts with
+    its backslash.
+
+    - [\uXXXX] and [\xXX] become the UTF-8 encoding of the code point they
+      name.
+    - [\0] becomes a NUL byte and [\0dd] the character with octal code [dd].
+    - A backslash followed by one of the digits 1 to 9 is an error.
+    - Any other sequence is returned unchanged, backslash included.
+
+    NOTE(review): sequences of the last kind are not decoded here; confirm
+    whether a later stage is expected to do it.
+    NOTE(review): a surrogate pair written as two [\uXXXX] escapes reaches
+    this function one half at a time, so each half is rejected. *)
+let unescape str =
+  if String.length str < 2 then
+    Error (Format.sprintf "too small escape sequence %s" str)
+  else
+    match str.[1] with
+    | 'u' -> numeric_escape ~prefix:"0x" ~len:4 str
+    | 'x' -> numeric_escape ~prefix:"0x" ~len:2 str
+    | '0' ->
+        if String.length str = 2 then Ok "\x00"
+        else if String.length str = 4 then
+          numeric_escape ~prefix:"0o" ~len:2 str
+        else Error (Format.sprintf "invalid octal sequence %s" str)
+    | '1' .. '9' as c -> Error (Format.sprintf "invalid escape sequence %c" c)
+    | _ ->
+        (* The lexer also accepts the single-character escapes and escaped
+           ordinary characters; they are kept as written rather than
+           rejected. *)
+        Ok str
diff --git a/test_json5/test.ml b/test_json5/test.ml
index 3488a54c..1b853e57 100644
--- a/test_json5/test.ml
+++ b/test_json5/test.ml
@@ -29,6 +29,23 @@ let test_from_string () =
     "float" (`Float 12345.67890)
     (M.from_string "12345.67890");
   Alcotest.(check yojson_json5) "hex" (`Int 0x1) (M.from_string "0x1");
+  Alcotest.(check yojson_json5)
+    "hex escape sequence" (`String "a") (M.from_string {|"\x61"|});
+  Alcotest.(check yojson_json5)
+    "unicode escape sequence" (`String "λ")
+    (M.from_string {|"\u03bb"|});
+  Alcotest.(check yojson_json5)
+    "more string escaping" (`String "Hello λ world")
+    (M.from_string "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\"");
+  Alcotest.(check yojson_json5)
+    "null byte string" (`String "\x00") (M.from_string {|"\0"|});
+  Alcotest.(check yojson_json5)
+    "octal string" (`String "?") (M.from_string {|"\077"|});
+  Alcotest.(check yojson_json5)
+    "null and octal string" (`String "\x007") (M.from_string {|"\07"|});
+  Alcotest.(check yojson_json5)
+    "three byte unicode escape sequence" (`String "\xE2\x82\xAC")
+    (M.from_string {|"\u20ac"|});
   Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1");
   Alcotest.(check yojson_json5)
     "line break" (`String "foo\\\nbar")