From cdb009957dc19ec58a42aefd0bf3b5b189c22112 Mon Sep 17 00:00:00 2001 From: Jules Aguillon Date: Wed, 20 Nov 2024 15:13:47 +0100 Subject: [PATCH] 5.3 support: Backport utf8 identifiers (#2622) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vendor required code from Misc.Utf8_lexeme, String, Bytes and Uchar. Backport utf8 identifiers to standard and extended parser. This reduces the diff with upstream. * test: Add utf8_identifiers.ml Taken from the compiler source. * test_branch: Ignore parsing errors on 'infer' The project contains a syntax error, that is new in OCaml 5.3: (** [x < min({x'|x'∊l})] *) --- CHANGES.md | 5 +- test-extra/Makefile | 4 +- test/failing/tests/unit_lex.ml.broken-ref | 9 +- test/failing/tests/unit_values.ml.broken-ref | 11 +- test/passing/gen/dune.inc | 15 + .../refs.default/utf8_identifiers.ml.ref | 18 + .../refs.janestreet/utf8_identifiers.ml.ref | 19 + .../refs.ocamlformat/utf8_identifiers.ml.ref | 18 + test/passing/tests/utf8_identifiers.ml | 18 + vendor/parser-extended/lexer.mll | 153 +++++-- vendor/parser-shims/misc_.ml | 393 ++++++++++++++++++ .../parser-shims/ocamlformat_parser_shims.ml | 166 +------- .../stdlib_shims/ocamlformat_stdlib_shims.ml | 115 +++++ vendor/parser-standard/lexer.mll | 149 +++++-- 14 files changed, 825 insertions(+), 268 deletions(-) create mode 100644 test/passing/refs.default/utf8_identifiers.ml.ref create mode 100644 test/passing/refs.janestreet/utf8_identifiers.ml.ref create mode 100644 test/passing/refs.ocamlformat/utf8_identifiers.ml.ref create mode 100644 test/passing/tests/utf8_identifiers.ml create mode 100644 vendor/parser-shims/misc_.ml diff --git a/CHANGES.md b/CHANGES.md index 7efb810f96..70a1788e76 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,8 +14,9 @@ profile. This started with version 0.26.0. This might change the formatting of some functions due to the formatting code being completely rewritten. -- Support OCaml 5.3 syntax (#2609, #2610, #2611, @Julow) - This adds support for short functor type arguments syntax. +- Support OCaml 5.3 syntax (#2609, #2610, #2611, #2622, @Julow) + This adds support for short functor type arguments syntax and utf8 + identifiers. - Documentation comments are now formatted by default (#2390, @Julow) Use the option `parse-docstrings = false` to restore the previous behavior. diff --git a/test-extra/Makefile b/test-extra/Makefile index 7ddae35ca8..54ca068ddd 100644 --- a/test-extra/Makefile +++ b/test-extra/Makefile @@ -13,11 +13,11 @@ # make DIRS= test # By default, test projects used as regression tests DIRS= \ - code/ocamlformat code/infer code/js_of_ocaml code/dune code/irmin \ + code/ocamlformat code/js_of_ocaml code/dune code/irmin \ code/dune-release code/mirage code/ppxlib code/base # Extra test directories, for which looser checking is done -XDIRS=code/ocaml +XDIRS=code/ocaml code/infer # Directories to ignore (given to find, compared literally) PRUNE_DIRS= \ diff --git a/test/failing/tests/unit_lex.ml.broken-ref b/test/failing/tests/unit_lex.ml.broken-ref index fd57965f35..e767e1d6c8 100644 --- a/test/failing/tests/unit_lex.ml.broken-ref +++ b/test/failing/tests/unit_lex.ml.broken-ref @@ -1,10 +1,5 @@ +ocamlformat: ignoring "tests/unit_lex.ml" (syntax error) File "tests/unit_lex.ml", line 18, characters 4-10: 18 | ������ (* this file must be iso-8859-1 *) ^^^^^^ -Alert deprecated: ISO-Latin1 characters in identifiers -ocamlformat: ignoring "tests/unit_lex.ml" (syntax error) - -File "tests/unit_lex.ml", line 55, characters 2-8: -55 | '\999'; (* wrong, but yet... *) - ^^^^^^ -Error: Illegal backslash escape in string or character ('\999'): 999 is outside the range of legal characters (0-255). +Error: Invalid encoding of identifier ������. diff --git a/test/failing/tests/unit_values.ml.broken-ref b/test/failing/tests/unit_values.ml.broken-ref index 95354696a4..3475fbf338 100644 --- a/test/failing/tests/unit_values.ml.broken-ref +++ b/test/failing/tests/unit_values.ml.broken-ref @@ -1,10 +1,5 @@ -File "tests/unit_values.ml", line 6, characters 10-11: -6 | let i32 = −1073741824, 1073741823 - ^ -Alert deprecated: ISO-Latin1 characters in identifiers ocamlformat: ignoring "tests/unit_values.ml" (syntax error) - -File "tests/unit_values.ml", line 6, characters 11-12: +File "tests/unit_values.ml", line 6, characters 10-23: 6 | let i32 = −1073741824, 1073741823 - ^ -Error: Illegal character (\136) + ^^^^^^^^^^^^^ +Error: Invalid character U+2212 in identifier diff --git a/test/passing/gen/dune.inc b/test/passing/gen/dune.inc index e78aeaea66..ff65935a81 100644 --- a/test/passing/gen/dune.inc +++ b/test/passing/gen/dune.inc @@ -4525,6 +4525,21 @@ (alias runtest) (action (diff use_file.mlt.err use_file.mlt.stderr))) +(rule + (deps .ocamlformat dune-project) + (action + (with-stdout-to utf8_identifiers.ml.stdout + (with-stderr-to utf8_identifiers.ml.stderr + (run %{bin:ocamlformat} --name utf8_identifiers.ml --margin-check %{dep:../tests/utf8_identifiers.ml}))))) + +(rule + (alias runtest) + (action (diff utf8_identifiers.ml.ref utf8_identifiers.ml.stdout))) + +(rule + (alias runtest) + (action (diff utf8_identifiers.ml.err utf8_identifiers.ml.stderr))) + (rule (deps .ocamlformat dune-project) (action diff --git a/test/passing/refs.default/utf8_identifiers.ml.ref b/test/passing/refs.default/utf8_identifiers.ml.ref new file mode 100644 index 0000000000..683cf9c3ae --- /dev/null +++ b/test/passing/refs.default/utf8_identifiers.ml.ref @@ -0,0 +1,18 @@ +(* TEST +readonly_files = "genfiles.ml"; +setup-ocamlc.byte-build-env; +all_modules = "genfiles.ml"; +program = "./genfiles.byte.exe"; +ocamlc.byte; +run; +all_modules = "été.ml ça.ml test.ml"; +program = "./main.byte.exe"; +ocamlc.byte; +run; +*) + +let _ = + (* Source is NFC *) + assert (Été.x + Ça.x = 3); + (* Source is NFD *) + assert (Été.x + Ça.x = 3) diff --git a/test/passing/refs.janestreet/utf8_identifiers.ml.ref b/test/passing/refs.janestreet/utf8_identifiers.ml.ref new file mode 100644 index 0000000000..87f5c89d00 --- /dev/null +++ b/test/passing/refs.janestreet/utf8_identifiers.ml.ref @@ -0,0 +1,19 @@ +(* TEST +readonly_files = "genfiles.ml"; +setup-ocamlc.byte-build-env; +all_modules = "genfiles.ml"; +program = "./genfiles.byte.exe"; +ocamlc.byte; +run; +all_modules = "été.ml ça.ml test.ml"; +program = "./main.byte.exe"; +ocamlc.byte; +run; +*) + +let _ = + (* Source is NFC *) + assert (Été.x + Ça.x = 3); + (* Source is NFD *) + assert (Été.x + Ça.x = 3) +;; diff --git a/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref b/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref new file mode 100644 index 0000000000..f28103171f --- /dev/null +++ b/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref @@ -0,0 +1,18 @@ +(* TEST +readonly_files = "genfiles.ml"; +setup-ocamlc.byte-build-env; +all_modules = "genfiles.ml"; +program = "./genfiles.byte.exe"; +ocamlc.byte; +run; +all_modules = "été.ml ça.ml test.ml"; +program = "./main.byte.exe"; +ocamlc.byte; +run; +*) + +let _ = + (* Source is NFC *) + assert (Été.x + Ça.x = 3) ; + (* Source is NFD *) + assert (Été.x + Ça.x = 3) diff --git a/test/passing/tests/utf8_identifiers.ml b/test/passing/tests/utf8_identifiers.ml new file mode 100644 index 0000000000..b9c8cb0d5b --- /dev/null +++ b/test/passing/tests/utf8_identifiers.ml @@ -0,0 +1,18 @@ +(* TEST +readonly_files = "genfiles.ml"; +setup-ocamlc.byte-build-env; +all_modules = "genfiles.ml"; +program = "./genfiles.byte.exe"; +ocamlc.byte; +run; +all_modules = "été.ml ça.ml test.ml"; +program = "./main.byte.exe"; +ocamlc.byte; +run; +*) + +let _ = + (* Source is NFC *) + assert (Été.x + Ça.x = 3); + (* Source is NFD *) + assert (Été.x + Ça.x = 3) diff --git a/vendor/parser-extended/lexer.mll b/vendor/parser-extended/lexer.mll index 4abdb11f19..e1941d0a63 100644 --- a/vendor/parser-extended/lexer.mll +++ b/vendor/parser-extended/lexer.mll @@ -29,8 +29,13 @@ type error = | Unterminated_string_in_comment of Location.t * Location.t | Empty_character_literal | Keyword_as_label of string + | Capitalized_label of string | Invalid_literal of string | Invalid_directive of string * string option + | Invalid_encoding of string + | Invalid_char_in_ident of Uchar.t + | Non_lowercase_delimiter of string + | Capitalized_raw_identifier of string exception Error of error * Location.t @@ -228,10 +233,46 @@ let uchar_for_uchar_escape lexbuf = illegal_escape lexbuf (Printf.sprintf "%X is not a Unicode scalar value" cp) +let validate_encoding lexbuf raw_name = + match Utf8_lexeme.normalize raw_name with + | Error _ -> error lexbuf (Invalid_encoding raw_name) + | Ok name -> name + +let ident_for_extended lexbuf raw_name = + let name = validate_encoding lexbuf raw_name in + match Utf8_lexeme.validate_identifier name with + | Utf8_lexeme.Valid -> name + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) + | Utf8_lexeme.Invalid_beginning _ -> + assert false (* excluded by the regexps *) + +let validate_delim lexbuf raw_name = + let name = validate_encoding lexbuf raw_name in + if Utf8_lexeme.is_lowercase name then name + else error lexbuf (Non_lowercase_delimiter name) + +let validate_ext lexbuf name = + let name = validate_encoding lexbuf name in + match Utf8_lexeme.validate_identifier ~with_dot:true name with + | Utf8_lexeme.Valid -> name + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) + | Utf8_lexeme.Invalid_beginning _ -> + assert false (* excluded by the regexps *) + +let lax_delim raw_name = + match Utf8_lexeme.normalize raw_name with + | Error _ -> None + | Ok name -> + if Utf8_lexeme.is_lowercase name then Some name + else None + let is_keyword name = Hashtbl.mem keyword_table name -let check_label_name lexbuf name = - if is_keyword name then error lexbuf (Keyword_as_label name) +let check_label_name ?(raw_escape=false) lexbuf name = + if Utf8_lexeme.is_capitalized name then + error lexbuf (Capitalized_label name); + if not raw_escape && is_keyword name then + error lexbuf (Keyword_as_label name) (* To "unlex" a few characters *) let set_lexeme_length buf n = ( @@ -282,13 +323,6 @@ let preprocessor = ref None let escaped_newlines = ref false -(* Warn about Latin-1 characters used in idents *) - -let warn_latin1 lexbuf = - Location.deprecated - (Location.curr lexbuf) - "ISO-Latin1 characters in identifiers" - type comment = [ `Comment of string | `Docstring of string ] let handle_docstrings = ref true @@ -340,6 +374,10 @@ let prepare_error loc = function | Keyword_as_label kwd -> Location.errorf ~loc "%a is a keyword, it cannot be used as label name" Style.inline_code kwd + | Capitalized_label lbl -> + Location.errorf ~loc + "%a cannot be used as label name, \ + it must start with a lowercase letter" Style.inline_code lbl | Invalid_literal s -> Location.errorf ~loc "Invalid literal %s" s | Invalid_directive (dir, explanation) -> @@ -347,6 +385,20 @@ let prepare_error loc = function (fun ppf -> match explanation with | None -> () | Some expl -> fprintf ppf ": %s" expl) + | Invalid_encoding s -> + Location.errorf ~loc "Invalid encoding of identifier %s." s + | Invalid_char_in_ident u -> + Location.errorf ~loc "Invalid character U+%X in identifier" + (Uchar.to_int u) + | Capitalized_raw_identifier lbl -> + Location.errorf ~loc + "%a cannot be used as a raw identifier, \ + it must start with a lowercase letter" Style.inline_code lbl + | Non_lowercase_delimiter name -> + Location.errorf ~loc + "%a cannot be used as a quoted string delimiter,@ \ + it must contain only lowercase letters." + Style.inline_code name let () = Location.register_error_of_exn @@ -363,12 +415,11 @@ let newline = ('\013'* '\010') let blank = [' ' '\009' '\012'] let lowercase = ['a'-'z' '_'] let uppercase = ['A'-'Z'] +let identstart = lowercase | uppercase let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] -let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] -let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222'] -let identchar_latin1 = - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] -(* This should be kept in sync with the [is_identchar] function in [env.ml] *) +let utf8 = ['\192'-'\255'] ['\128'-'\191']* +let identstart_ext = identstart | utf8 +let identchar_ext = identchar | utf8 let symbolchar = ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] @@ -380,7 +431,8 @@ let kwdopchar = ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|'] let ident = (lowercase | uppercase) identchar* -let extattrident = ident ('.' ident)* +let ident_ext = identstart_ext identchar_ext* +let extattrident = ident_ext ('.' ident_ext)* let decimal_literal = ['0'-'9'] ['0'-'9' '_']* @@ -423,36 +475,40 @@ rule token = parse | ".~" { error lexbuf (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) } - | "~" (raw_ident_escape lowercase identchar * as name) ':' - { LABEL name } - | "~" (lowercase identchar * as name) ':' + | "~" (identstart identchar * as name) ':' { check_label_name lexbuf name; LABEL name } - | "~" (lowercase_latin1 identchar_latin1 * as name) ':' - { warn_latin1 lexbuf; - LABEL name } + | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' + { let name = ident_for_extended lexbuf raw_name in + check_label_name ~raw_escape:(escape<>"") lexbuf name; + LABEL (escape ^ name) } | "?" { QUESTION } - | "?" (raw_ident_escape lowercase identchar * as name) ':' - { OPTLABEL name } | "?" (lowercase identchar * as name) ':' { check_label_name lexbuf name; OPTLABEL name } - | "?" (lowercase_latin1 identchar_latin1 * as name) ':' - { warn_latin1 lexbuf; - OPTLABEL name } - | (raw_ident_escape lowercase identchar * as name) - (* Compared to upstream, the raw_ident_escape is part of the lident. *) - { LIDENT name } + | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' + { let name = ident_for_extended lexbuf raw_name in + check_label_name ~raw_escape:(escape<>"") lexbuf name; + OPTLABEL (escape ^ name) + } | lowercase identchar * as name { try Hashtbl.find keyword_table name with Not_found -> LIDENT name } - | lowercase_latin1 identchar_latin1 * as name - { warn_latin1 lexbuf; LIDENT name } | uppercase identchar * as name { UIDENT name } (* No capitalized keywords *) - | uppercase_latin1 identchar_latin1 * as name - { warn_latin1 lexbuf; UIDENT name } + | (raw_ident_escape? as escape) (ident_ext as raw_name) + { let name = ident_for_extended lexbuf raw_name in + if Utf8_lexeme.is_capitalized name then begin + if escape="" then UIDENT name + else + (* we don't have capitalized keywords, and thus no needs for + capitalized raw identifiers. *) + error lexbuf (Capitalized_raw_identifier name) + end else + (* Compared to upstream, the raw_ident_escape is part of the lident. *) + LIDENT (escape ^ name) + } (* No non-ascii keywords *) | int_literal as lit { INT (lit, None) } | (int_literal as lit) (literal_modifier as modif) { INT (lit, Some modif) } @@ -465,26 +521,34 @@ rule token = parse | "\"" { let s, loc = wrap_string_lexer string lexbuf in STRING (s, loc, None) } - | "{" (lowercase* as delim) "|" - { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in - STRING (s, loc, Some delim) } - | "{%" (extattrident as id) "|" + | "{" (ident_ext? as raw_name) '|' + { let delim = validate_delim lexbuf raw_name in + let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in + STRING (s, loc, Some delim) + } + | "{%" (extattrident as raw_id) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in let s, loc = wrap_string_lexer (quoted_string "") lexbuf in let idloc = compute_quoted_string_idloc orig_loc 2 id in QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") } - | "{%" (extattrident as id) blank+ (lowercase* as delim) "|" + | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in + let delim = validate_delim lexbuf raw_delim in let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in let idloc = compute_quoted_string_idloc orig_loc 2 id in QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) } - | "{%%" (extattrident as id) "|" + | "{%%" (extattrident as raw_id) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in let s, loc = wrap_string_lexer (quoted_string "") lexbuf in let idloc = compute_quoted_string_idloc orig_loc 3 id in QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") } - | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|" + | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in + let delim = validate_delim lexbuf raw_delim in let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in let idloc = compute_quoted_string_idloc orig_loc 3 id in QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) } @@ -664,8 +728,10 @@ and comment = parse is_in_string := false; store_string_char '\"'; comment lexbuf } - | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|" - { + | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|" + { match lax_delim raw_delim with + | None -> store_lexeme lexbuf; comment lexbuf + | Some delim -> string_start_loc := Location.curr lexbuf; store_lexeme lexbuf; is_in_string := true; @@ -775,8 +841,9 @@ and quoted_string delim = parse | eof { is_in_string := false; error_loc !string_start_loc Unterminated_string } - | "|" (lowercase* as edelim) "}" + | "|" (ident_ext? as raw_edelim) "}" { + let edelim = validate_encoding lexbuf raw_edelim in if delim = edelim then lexbuf.lex_start_p else (store_lexeme lexbuf; quoted_string delim lexbuf) } diff --git a/vendor/parser-shims/misc_.ml b/vendor/parser-shims/misc_.ml new file mode 100644 index 0000000000..e8bf50687d --- /dev/null +++ b/vendor/parser-shims/misc_.ml @@ -0,0 +1,393 @@ +(** {1 Minimal support for Unicode characters in identifiers} *) + +include Misc + +module Color = struct + include Color + + external isatty : out_channel -> bool = "caml_sys_isatty" + + (* reasonable heuristic on whether colors should be enabled *) + let should_enable_color () = + let term = try Sys.getenv "TERM" with Not_found -> "" in + term <> "dumb" + && term <> "" + && isatty stderr + + let default_setting = Auto + let enabled = ref true +end + +module Error_style = struct + include Error_style + + let default_setting = Contextual +end + +(* Terminal styling handling *) +module Style = struct + (* use ANSI color codes, see https://en.wikipedia.org/wiki/ANSI_escape_code *) + type color = + | Black + | Red + | Green + | Yellow + | Blue + | Magenta + | Cyan + | White + + type style = + | FG of color (* foreground *) + | BG of color (* background *) + | Bold + | Reset + + let ansi_of_color = function + | Black -> "0" + | Red -> "1" + | Green -> "2" + | Yellow -> "3" + | Blue -> "4" + | Magenta -> "5" + | Cyan -> "6" + | White -> "7" + + let code_of_style = function + | FG c -> "3" ^ ansi_of_color c + | BG c -> "4" ^ ansi_of_color c + | Bold -> "1" + | Reset -> "0" + + let ansi_of_style_l l = + let s = match l with + | [] -> code_of_style Reset + | [s] -> code_of_style s + | _ -> String.concat ";" (List.map code_of_style l) + in + "\x1b[" ^ s ^ "m" + + type Format.stag += Style of style list + + type tag_style ={ + ansi: style list; + text_open:string; + text_close:string + } + + type styles = { + error: tag_style; + warning: tag_style; + loc: tag_style; + hint: tag_style; + inline_code: tag_style; + } + + let no_markup stl = { ansi = stl; text_close = ""; text_open = "" } + + let default_styles = { + warning = no_markup [Bold; FG Magenta]; + error = no_markup [Bold; FG Red]; + loc = no_markup [Bold]; + hint = no_markup [Bold; FG Blue]; + inline_code= { ansi=[Bold]; text_open = {|"|}; text_close = {|"|} } + } + + let cur_styles = ref default_styles + let get_styles () = !cur_styles + let set_styles s = cur_styles := s + + (* map a tag to a style, if the tag is known. + @raise Not_found otherwise *) + let style_of_tag s = match s with + | Format.String_tag "error" -> (!cur_styles).error + | Format.String_tag "warning" ->(!cur_styles).warning + | Format.String_tag "loc" -> (!cur_styles).loc + | Format.String_tag "hint" -> (!cur_styles).hint + | Format.String_tag "inline_code" -> (!cur_styles).inline_code + | Style s -> no_markup s + | _ -> raise Not_found + + + let as_inline_code printer ppf x = + let open Format_doc in + pp_open_stag ppf (Format.String_tag "inline_code"); + printer ppf x; + pp_close_stag ppf () + + let inline_code ppf s = as_inline_code Format_doc.pp_print_string ppf s + + (* either prints the tag of [s] or delegates to [or_else] *) + let mark_open_tag ~or_else s = + try + let style = style_of_tag s in + if !Color.enabled then ansi_of_style_l style.ansi else style.text_open + with Not_found -> or_else s + + let mark_close_tag ~or_else s = + try + let style = style_of_tag s in + if !Color.enabled then ansi_of_style_l [Reset] else style.text_close + with Not_found -> or_else s + + (* add tag handling to formatter [ppf] *) + let set_tag_handling ppf = + let open Format in + let functions = pp_get_formatter_stag_functions ppf () in + let functions' = {functions with + mark_open_stag=(mark_open_tag ~or_else:functions.mark_open_stag); + mark_close_stag=(mark_close_tag ~or_else:functions.mark_close_stag); + } in + pp_set_mark_tags ppf true; (* enable tags *) + pp_set_formatter_stag_functions ppf functions'; + () + + let setup = + let first = ref true in (* initialize only once *) + let formatter_l = + [Format.std_formatter; Format.err_formatter; Format.str_formatter] + in + let enable_color = function + | Color.Auto -> Color.should_enable_color () + | Color.Always -> true + | Color.Never -> false + in + fun o -> + if !first then ( + first := false; + Format.set_mark_tags true; + List.iter set_tag_handling formatter_l; + Color.enabled := (match o with + | Some s -> enable_color s + | None -> enable_color Color.default_setting) + ); + () +end + +module Utf8_lexeme = struct + + type t = string + + (* Non-ASCII letters that are allowed in identifiers (currently: Latin-9) *) + + type case = Upper of Uchar.t | Lower of Uchar.t + let known_chars : (Uchar.t, case) Hashtbl.t = Hashtbl.create 32 + + let _ = + List.iter + (fun (upper, lower) -> + let upper = Uchar.of_int upper and lower = Uchar.of_int lower in + Hashtbl.add known_chars upper (Upper lower); + Hashtbl.add known_chars lower (Lower upper)) + [ + (0xc0, 0xe0); (* À, à *) (0xc1, 0xe1); (* Á, á *) + (0xc2, 0xe2); (* Â, â *) (0xc3, 0xe3); (* Ã, ã *) + (0xc4, 0xe4); (* Ä, ä *) (0xc5, 0xe5); (* Å, å *) + (0xc6, 0xe6); (* Æ, æ *) (0xc7, 0xe7); (* Ç, ç *) + (0xc8, 0xe8); (* È, è *) (0xc9, 0xe9); (* É, é *) + (0xca, 0xea); (* Ê, ê *) (0xcb, 0xeb); (* Ë, ë *) + (0xcc, 0xec); (* Ì, ì *) (0xcd, 0xed); (* Í, í *) + (0xce, 0xee); (* Î, î *) (0xcf, 0xef); (* Ï, ï *) + (0xd0, 0xf0); (* Ð, ð *) (0xd1, 0xf1); (* Ñ, ñ *) + (0xd2, 0xf2); (* Ò, ò *) (0xd3, 0xf3); (* Ó, ó *) + (0xd4, 0xf4); (* Ô, ô *) (0xd5, 0xf5); (* Õ, õ *) + (0xd6, 0xf6); (* Ö, ö *) (0xd8, 0xf8); (* Ø, ø *) + (0xd9, 0xf9); (* Ù, ù *) (0xda, 0xfa); (* Ú, ú *) + (0xdb, 0xfb); (* Û, û *) (0xdc, 0xfc); (* Ü, ü *) + (0xdd, 0xfd); (* Ý, ý *) (0xde, 0xfe); (* Þ, þ *) + (0x160, 0x161); (* Š, š *) (0x17d, 0x17e); (* Ž, ž *) + (0x152, 0x153); (* Œ, œ *) (0x178, 0xff); (* Ÿ, ÿ *) + (0x1e9e, 0xdf); (* ẞ, ß *) + ] + + (* NFD to NFC conversion table for the letters above *) + + let known_pairs : (Uchar.t * Uchar.t, Uchar.t) Hashtbl.t = Hashtbl.create 32 + + let _ = + List.iter + (fun (c1, n2, n) -> + Hashtbl.add known_pairs + (Uchar.of_char c1, Uchar.of_int n2) (Uchar.of_int n)) + [ + ('A', 0x300, 0xc0); (* À *) ('A', 0x301, 0xc1); (* Á *) + ('A', 0x302, 0xc2); (*  *) ('A', 0x303, 0xc3); (* à *) + ('A', 0x308, 0xc4); (* Ä *) ('A', 0x30a, 0xc5); (* Å *) + ('C', 0x327, 0xc7); (* Ç *) ('E', 0x300, 0xc8); (* È *) + ('E', 0x301, 0xc9); (* É *) ('E', 0x302, 0xca); (* Ê *) + ('E', 0x308, 0xcb); (* Ë *) ('I', 0x300, 0xcc); (* Ì *) + ('I', 0x301, 0xcd); (* Í *) ('I', 0x302, 0xce); (* Î *) + ('I', 0x308, 0xcf); (* Ï *) ('N', 0x303, 0xd1); (* Ñ *) + ('O', 0x300, 0xd2); (* Ò *) ('O', 0x301, 0xd3); (* Ó *) + ('O', 0x302, 0xd4); (* Ô *) ('O', 0x303, 0xd5); (* Õ *) + ('O', 0x308, 0xd6); (* Ö *) + ('U', 0x300, 0xd9); (* Ù *) ('U', 0x301, 0xda); (* Ú *) + ('U', 0x302, 0xdb); (* Û *) ('U', 0x308, 0xdc); (* Ü *) + ('Y', 0x301, 0xdd); (* Ý *) ('Y', 0x308, 0x178); (* Ÿ *) + ('S', 0x30c, 0x160); (* Š *) ('Z', 0x30c, 0x17d); (* Ž *) + ('a', 0x300, 0xe0); (* à *) ('a', 0x301, 0xe1); (* á *) + ('a', 0x302, 0xe2); (* â *) ('a', 0x303, 0xe3); (* ã *) + ('a', 0x308, 0xe4); (* ä *) ('a', 0x30a, 0xe5); (* å *) + ('c', 0x327, 0xe7); (* ç *) ('e', 0x300, 0xe8); (* è *) + ('e', 0x301, 0xe9); (* é *) ('e', 0x302, 0xea); (* ê *) + ('e', 0x308, 0xeb); (* ë *) ('i', 0x300, 0xec); (* ì *) + ('i', 0x301, 0xed); (* í *) ('i', 0x302, 0xee); (* î *) + ('i', 0x308, 0xef); (* ï *) ('n', 0x303, 0xf1); (* ñ *) + ('o', 0x300, 0xf2); (* ò *) ('o', 0x301, 0xf3); (* ó *) + ('o', 0x302, 0xf4); (* ô *) ('o', 0x303, 0xf5); (* õ *) + ('o', 0x308, 0xf6); (* ö *) + ('u', 0x300, 0xf9); (* ù *) ('u', 0x301, 0xfa); (* ú *) + ('u', 0x302, 0xfb); (* û *) ('u', 0x308, 0xfc); (* ü *) + ('y', 0x301, 0xfd); (* ý *) ('y', 0x308, 0xff); (* ÿ *) + ('s', 0x30c, 0x161); (* š *) ('z', 0x30c, 0x17e); (* ž *) + ] + + let normalize_generic ~keep_ascii transform s = + let rec norm check buf prev i = + if i >= String.length s then begin + Buffer.add_utf_8_uchar buf (transform prev) + end else begin + let d = String.get_utf_8_uchar s i in + let u = Uchar.utf_decode_uchar d in + check d u; + let i' = i + Uchar.utf_decode_length d in + match Hashtbl.find_opt known_pairs (prev, u) with + | Some u' -> + norm check buf u' i' + | None -> + Buffer.add_utf_8_uchar buf (transform prev); + norm check buf u i' + end in + let ascii_limit = 128 in + if s = "" + || keep_ascii && String.for_all (fun x -> Char.code x < ascii_limit) s + then Ok s + else + let buf = Buffer.create (String.length s) in + let valid = ref true in + let check d u = + valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep + in + let d = String.get_utf_8_uchar s 0 in + let u = Uchar.utf_decode_uchar d in + check d u; + norm check buf u (Uchar.utf_decode_length d); + let contents = Buffer.contents buf in + if !valid then + Ok contents + else + Error contents + + let normalize s = + normalize_generic ~keep_ascii:true (fun u -> u) s + + (* Capitalization *) + + let uchar_is_uppercase u = + let c = Uchar.to_int u in + if c < 0x80 then c >= 65 && c <= 90 else + match Hashtbl.find_opt known_chars u with + | Some(Upper _) -> true + | _ -> false + + let uchar_lowercase u = + let c = Uchar.to_int u in + if c < 0x80 then + if c >= 65 && c <= 90 then Uchar.of_int (c + 32) else u + else + match Hashtbl.find_opt known_chars u with + | Some(Upper u') -> u' + | _ -> u + + let uchar_uppercase u = + let c = Uchar.to_int u in + if c < 0x80 then + if c >= 97 && c <= 122 then Uchar.of_int (c - 32) else u + else + match Hashtbl.find_opt known_chars u with + | Some(Lower u') -> u' + | _ -> u + + let capitalize s = + let first = ref true in + normalize_generic ~keep_ascii:false + (fun u -> if !first then (first := false; uchar_uppercase u) else u) + s + + let uncapitalize s = + let first = ref true in + normalize_generic ~keep_ascii:false + (fun u -> if !first then (first := false; uchar_lowercase u) else u) + s + + let is_capitalized s = + s <> "" && + uchar_is_uppercase (Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0)) + + (* Characters allowed in identifiers after normalization is applied. + Currently: + - ASCII letters, underscore + - Latin-9 letters, represented in NFC + - ASCII digits, single quote (but not as first character) + - dot if [with_dot] = true + *) + let uchar_valid_in_identifier ~with_dot u = + let c = Uchar.to_int u in + if c < 0x80 then + c >= 97 (* a *) && c <= 122 (* z *) + || c >= 65 (* A *) && c <= 90 (* Z *) + || c >= 48 (* 0 *) && c <= 57 (* 9 *) + || c = 95 (* underscore *) + || c = 39 (* single quote *) + || (with_dot && c = 46) (* dot *) + else + Hashtbl.mem known_chars u + + let uchar_not_identifier_start u = + let c = Uchar.to_int u in + c >= 48 (* 0 *) && c <= 57 (* 9 *) + || c = 39 (* single quote *) + + (* Check whether a normalized string is a valid OCaml identifier. *) + + type validation_result = + | Valid + | Invalid_character of Uchar.t (** Character not allowed *) + | Invalid_beginning of Uchar.t (** Character not allowed as first char *) + + let validate_identifier ?(with_dot=false) s = + let rec check i = + if i >= String.length s then Valid else begin + let d = String.get_utf_8_uchar s i in + let u = Uchar.utf_decode_uchar d in + let i' = i + Uchar.utf_decode_length d in + if not (uchar_valid_in_identifier ~with_dot u) then + Invalid_character u + else if i = 0 && uchar_not_identifier_start u then + Invalid_beginning u + else + check i' + end + in check 0 + + let is_valid_identifier s = + validate_identifier s = Valid + + let starts_like_a_valid_identifier s = + s <> "" && + (let u = Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0) in + uchar_valid_in_identifier ~with_dot:false u + && not (uchar_not_identifier_start u)) + + let is_lowercase s = + let rec is_lowercase_at len s n = + if n >= len then true + else + let d = String.get_utf_8_uchar s n in + let u = Uchar.utf_decode_uchar d in + (uchar_valid_in_identifier ~with_dot:false u) + && not (uchar_is_uppercase u) + && is_lowercase_at len s (n+Uchar.utf_decode_length d) + in + is_lowercase_at (String.length s) s 0 +end diff --git a/vendor/parser-shims/ocamlformat_parser_shims.ml b/vendor/parser-shims/ocamlformat_parser_shims.ml index b9d3f34885..1420bf7ba3 100644 --- a/vendor/parser-shims/ocamlformat_parser_shims.ml +++ b/vendor/parser-shims/ocamlformat_parser_shims.ml @@ -2,171 +2,7 @@ shims. *) include Ocamlformat_stdlib_shims -module Misc = struct - include Misc - - module Color = struct - include Color - - external isatty : out_channel -> bool = "caml_sys_isatty" - - (* reasonable heuristic on whether colors should be enabled *) - let should_enable_color () = - let term = try Sys.getenv "TERM" with Not_found -> "" in - term <> "dumb" - && term <> "" - && isatty stderr - - let default_setting = Auto - let enabled = ref true - end - - module Error_style = struct - include Error_style - - let default_setting = Contextual - end - - (* Terminal styling handling *) - module Style = struct - (* use ANSI color codes, see https://en.wikipedia.org/wiki/ANSI_escape_code *) - type color = - | Black - | Red - | Green - | Yellow - | Blue - | Magenta - | Cyan - | White - - type style = - | FG of color (* foreground *) - | BG of color (* background *) - | Bold - | Reset - - let ansi_of_color = function - | Black -> "0" - | Red -> "1" - | Green -> "2" - | Yellow -> "3" - | Blue -> "4" - | Magenta -> "5" - | Cyan -> "6" - | White -> "7" - - let code_of_style = function - | FG c -> "3" ^ ansi_of_color c - | BG c -> "4" ^ ansi_of_color c - | Bold -> "1" - | Reset -> "0" - - let ansi_of_style_l l = - let s = match l with - | [] -> code_of_style Reset - | [s] -> code_of_style s - | _ -> String.concat ";" (List.map code_of_style l) - in - "\x1b[" ^ s ^ "m" - - type Format.stag += Style of style list - - type tag_style ={ - ansi: style list; - text_open:string; - text_close:string - } - - type styles = { - error: tag_style; - warning: tag_style; - loc: tag_style; - hint: tag_style; - inline_code: tag_style; - } - - let no_markup stl = { ansi = stl; text_close = ""; text_open = "" } - - let default_styles = { - warning = no_markup [Bold; FG Magenta]; - error = no_markup [Bold; FG Red]; - loc = no_markup [Bold]; - hint = no_markup [Bold; FG Blue]; - inline_code= { ansi=[Bold]; text_open = {|"|}; text_close = {|"|} } - } - - let cur_styles = ref default_styles - let get_styles () = !cur_styles - let set_styles s = cur_styles := s - - (* map a tag to a style, if the tag is known. - @raise Not_found otherwise *) - let style_of_tag s = match s with - | Format.String_tag "error" -> (!cur_styles).error - | Format.String_tag "warning" ->(!cur_styles).warning - | Format.String_tag "loc" -> (!cur_styles).loc - | Format.String_tag "hint" -> (!cur_styles).hint - | Format.String_tag "inline_code" -> (!cur_styles).inline_code - | Style s -> no_markup s - | _ -> raise Not_found - - - let as_inline_code printer ppf x = - let open Format_doc in - pp_open_stag ppf (Format.String_tag "inline_code"); - printer ppf x; - pp_close_stag ppf () - - let inline_code ppf s = as_inline_code Format_doc.pp_print_string ppf s - - (* either prints the tag of [s] or delegates to [or_else] *) - let mark_open_tag ~or_else s = - try - let style = style_of_tag s in - if !Color.enabled then ansi_of_style_l style.ansi else style.text_open - with Not_found -> or_else s - - let mark_close_tag ~or_else s = - try - let style = style_of_tag s in - if !Color.enabled then ansi_of_style_l [Reset] else style.text_close - with Not_found -> or_else s - - (* add tag handling to formatter [ppf] *) - let set_tag_handling ppf = - let open Format in - let functions = pp_get_formatter_stag_functions ppf () in - let functions' = {functions with - mark_open_stag=(mark_open_tag ~or_else:functions.mark_open_stag); - mark_close_stag=(mark_close_tag ~or_else:functions.mark_close_stag); - } in - pp_set_mark_tags ppf true; (* enable tags *) - pp_set_formatter_stag_functions ppf functions'; - () - - let setup = - let first = ref true in (* initialize only once *) - let formatter_l = - [Format.std_formatter; Format.err_formatter; Format.str_formatter] - in - let enable_color = function - | Color.Auto -> Color.should_enable_color () - | Color.Always -> true - | Color.Never -> false - in - fun o -> - if !first then ( - first := false; - Format.set_mark_tags true; - List.iter set_tag_handling formatter_l; - Color.enabled := (match o with - | Some s -> enable_color s - | None -> enable_color Color.default_setting) - ); - () - end -end +module Misc = Misc_ module Clflags : sig val include_dirs : string list ref diff --git a/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml b/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml index 3dbd00f7ae..155512c609 100644 --- a/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml +++ b/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml @@ -20,3 +20,118 @@ end module Either = struct type ('a, 'b) t = Left of 'a | Right of 'b end + +module Uchar = struct + include Uchar + + let rep_ = 0xFFFD + let rep = unsafe_of_int rep_ + let valid_bit = 27 + let decode_bits = 24 + let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1 + let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111 + let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF) + let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (to_int u) + let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep_ +end + +module Bytes = struct + include Bytes + + external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get" + let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 + let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 + let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 + let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b + let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 + let dec_invalid = Uchar.utf_decode_invalid + let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) + + let[@inline] utf_8_uchar_2 b0 b1 = + ((b0 land 0x1F) lsl 6) lor + ((b1 land 0x3F)) + + let[@inline] utf_8_uchar_3 b0 b1 b2 = + ((b0 land 0x0F) lsl 12) lor + ((b1 land 0x3F) lsl 6) lor + ((b2 land 0x3F)) + + let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = + ((b0 land 0x07) lsl 18) lor + ((b1 land 0x3F) lsl 12) lor + ((b2 land 0x3F) lsl 6) lor + ((b3 land 0x3F)) + + let get_utf_8_uchar b i = + let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *) + let get = unsafe_get_uint8 in + let max = length b - 1 in + match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *) + | '\x00' .. '\x7F' -> dec_ret 1 b0 + | '\xC2' .. '\xDF' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else + dec_ret 2 (utf_8_uchar_2 b0 b1) + | '\xE0' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + dec_ret 3 (utf_8_uchar_3 b0 b1 b2) + | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + dec_ret 3 (utf_8_uchar_3 b0 b1 b2) + | '\xED' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + dec_ret 3 (utf_8_uchar_3 b0 b1 b2) + | '\xF0' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + let i = i + 1 in if i > max then dec_invalid 3 else + let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else + dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) + | '\xF1' .. '\xF3' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + let i = i + 1 in if i > max then dec_invalid 3 else + let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else + dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) + | '\xF4' -> + let i = i + 1 in if i > max then dec_invalid 1 else + let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else + let i = i + 1 in if i > max then dec_invalid 2 else + let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else + let i = i + 1 in if i > max then dec_invalid 3 else + let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else + dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) + | _ -> dec_invalid 1 + + let for_all p s = + let n = length s in + let rec loop i = + if i = n then true + else if p (unsafe_get s i) then loop (succ i) + else false in + loop 0 +end + +module String = struct + include String + module B = Bytes + + let bos = B.unsafe_of_string + let get_utf_8_uchar s i = B.get_utf_8_uchar (bos s) i + + let for_all f s = + B.for_all f (bos s) +end diff --git a/vendor/parser-standard/lexer.mll b/vendor/parser-standard/lexer.mll index 7bdeadcd00..5fad8bc9b4 100644 --- a/vendor/parser-standard/lexer.mll +++ b/vendor/parser-standard/lexer.mll @@ -29,8 +29,13 @@ type error = | Unterminated_string_in_comment of Location.t * Location.t | Empty_character_literal | Keyword_as_label of string + | Capitalized_label of string | Invalid_literal of string | Invalid_directive of string * string option + | Invalid_encoding of string + | Invalid_char_in_ident of Uchar.t + | Non_lowercase_delimiter of string + | Capitalized_raw_identifier of string exception Error of error * Location.t @@ -255,10 +260,46 @@ let uchar_for_uchar_escape lexbuf = illegal_escape lexbuf (Printf.sprintf "%X is not a Unicode scalar value" cp) +let validate_encoding lexbuf raw_name = + match Utf8_lexeme.normalize raw_name with + | Error _ -> error lexbuf (Invalid_encoding raw_name) + | Ok name -> name + +let ident_for_extended lexbuf raw_name = + let name = validate_encoding lexbuf raw_name in + match Utf8_lexeme.validate_identifier name with + | Utf8_lexeme.Valid -> name + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) + | Utf8_lexeme.Invalid_beginning _ -> + assert false (* excluded by the regexps *) + +let validate_delim lexbuf raw_name = + let name = validate_encoding lexbuf raw_name in + if Utf8_lexeme.is_lowercase name then name + else error lexbuf (Non_lowercase_delimiter name) + +let validate_ext lexbuf name = + let name = validate_encoding lexbuf name in + match Utf8_lexeme.validate_identifier ~with_dot:true name with + | Utf8_lexeme.Valid -> name + | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u) + | Utf8_lexeme.Invalid_beginning _ -> + assert false (* excluded by the regexps *) + +let lax_delim raw_name = + match Utf8_lexeme.normalize raw_name with + | Error _ -> None + | Ok name -> + if Utf8_lexeme.is_lowercase name then Some name + else None + let is_keyword name = Hashtbl.mem keyword_table name -let check_label_name lexbuf name = - if is_keyword name then error lexbuf (Keyword_as_label name) +let check_label_name ?(raw_escape=false) lexbuf name = + if Utf8_lexeme.is_capitalized name then + error lexbuf (Capitalized_label name); + if not raw_escape && is_keyword name then + error lexbuf (Keyword_as_label name) (* To "unlex" a few characters *) let set_lexeme_length buf n = ( @@ -309,13 +350,6 @@ let preprocessor = ref None let escaped_newlines = ref false -(* Warn about Latin-1 characters used in idents *) - -let warn_latin1 lexbuf = - Location.deprecated - (Location.curr lexbuf) - "ISO-Latin1 characters in identifiers" - let handle_docstrings = ref true let comment_list = ref [] @@ -366,6 +400,10 @@ let prepare_error loc = function | Keyword_as_label kwd -> Location.errorf ~loc "%a is a keyword, it cannot be used as label name" Style.inline_code kwd + | Capitalized_label lbl -> + Location.errorf ~loc + "%a cannot be used as label name, \ + it must start with a lowercase letter" Style.inline_code lbl | Invalid_literal s -> Location.errorf ~loc "Invalid literal %s" s | Invalid_directive (dir, explanation) -> @@ -373,6 +411,20 @@ let prepare_error loc = function (fun ppf -> match explanation with | None -> () | Some expl -> fprintf ppf ": %s" expl) + | Invalid_encoding s -> + Location.errorf ~loc "Invalid encoding of identifier %s." s + | Invalid_char_in_ident u -> + Location.errorf ~loc "Invalid character U+%X in identifier" + (Uchar.to_int u) + | Capitalized_raw_identifier lbl -> + Location.errorf ~loc + "%a cannot be used as a raw identifier, \ + it must start with a lowercase letter" Style.inline_code lbl + | Non_lowercase_delimiter name -> + Location.errorf ~loc + "%a cannot be used as a quoted string delimiter,@ \ + it must contain only lowercase letters." + Style.inline_code name let () = Location.register_error_of_exn @@ -389,12 +441,11 @@ let newline = ('\013'* '\010') let blank = [' ' '\009' '\012'] let lowercase = ['a'-'z' '_'] let uppercase = ['A'-'Z'] +let identstart = lowercase | uppercase let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] -let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] -let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222'] -let identchar_latin1 = - ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] -(* This should be kept in sync with the [is_identchar] function in [env.ml] *) +let utf8 = ['\192'-'\255'] ['\128'-'\191']* +let identstart_ext = identstart | utf8 +let identchar_ext = identchar | utf8 let symbolchar = ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] @@ -406,7 +457,8 @@ let kwdopchar = ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|'] let ident = (lowercase | uppercase) identchar* -let extattrident = ident ('.' ident)* +let ident_ext = identstart_ext identchar_ext* +let extattrident = ident_ext ('.' ident_ext)* let decimal_literal = ['0'-'9'] ['0'-'9' '_']* @@ -449,35 +501,39 @@ rule token = parse | ".~" { error lexbuf (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) } - | "~" raw_ident_escape (lowercase identchar * as name) ':' - { LABEL name } - | "~" (lowercase identchar * as name) ':' + | "~" (identstart identchar * as name) ':' { check_label_name lexbuf name; LABEL name } - | "~" (lowercase_latin1 identchar_latin1 * as name) ':' - { warn_latin1 lexbuf; + | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' + { let name = ident_for_extended lexbuf raw_name in + check_label_name ~raw_escape:(escape<>"") lexbuf name; LABEL name } | "?" { QUESTION } - | "?" raw_ident_escape (lowercase identchar * as name) ':' - { OPTLABEL name } | "?" (lowercase identchar * as name) ':' { check_label_name lexbuf name; OPTLABEL name } - | "?" (lowercase_latin1 identchar_latin1 * as name) ':' - { warn_latin1 lexbuf; - OPTLABEL name } - | raw_ident_escape (lowercase identchar * as name) - { LIDENT name } + | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':' + { let name = ident_for_extended lexbuf raw_name in + check_label_name ~raw_escape:(escape<>"") lexbuf name; + OPTLABEL name + } | lowercase identchar * as name { try Hashtbl.find keyword_table name with Not_found -> LIDENT name } - | lowercase_latin1 identchar_latin1 * as name - { warn_latin1 lexbuf; LIDENT name } | uppercase identchar * as name { UIDENT name } (* No capitalized keywords *) - | uppercase_latin1 identchar_latin1 * as name - { warn_latin1 lexbuf; UIDENT name } + | (raw_ident_escape? as escape) (ident_ext as raw_name) + { let name = ident_for_extended lexbuf raw_name in + if Utf8_lexeme.is_capitalized name then begin + if escape="" then UIDENT name + else + (* we don't have capitalized keywords, and thus no needs for + capitalized raw identifiers. *) + error lexbuf (Capitalized_raw_identifier name) + end else + LIDENT name + } (* No non-ascii keywords *) | int_literal as lit { INT (lit, None) } | (int_literal as lit) (literal_modifier as modif) { INT (lit, Some modif) } @@ -490,26 +546,34 @@ rule token = parse | "\"" { let s, loc = wrap_string_lexer string lexbuf in STRING (s, loc, None) } - | "{" (lowercase* as delim) "|" - { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in - STRING (s, loc, Some delim) } - | "{%" (extattrident as id) "|" + | "{" (ident_ext? as raw_name) '|' + { let delim = validate_delim lexbuf raw_name in + let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in + STRING (s, loc, Some delim) + } + | "{%" (extattrident as raw_id) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in let s, loc = wrap_string_lexer (quoted_string "") lexbuf in let idloc = compute_quoted_string_idloc orig_loc 2 id in QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") } - | "{%" (extattrident as id) blank+ (lowercase* as delim) "|" + | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in + let delim = validate_delim lexbuf raw_delim in let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in let idloc = compute_quoted_string_idloc orig_loc 2 id in QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) } - | "{%%" (extattrident as id) "|" + | "{%%" (extattrident as raw_id) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in let s, loc = wrap_string_lexer (quoted_string "") lexbuf in let idloc = compute_quoted_string_idloc orig_loc 3 id in QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") } - | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|" + | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|" { let orig_loc = Location.curr lexbuf in + let id = validate_ext lexbuf raw_id in + let delim = validate_delim lexbuf raw_delim in let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in let idloc = compute_quoted_string_idloc orig_loc 3 id in QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) } @@ -689,8 +753,10 @@ and comment = parse is_in_string := false; store_string_char '\"'; comment lexbuf } - | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|" - { + | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|" + { match lax_delim raw_delim with + | None -> store_lexeme lexbuf; comment lexbuf + | Some delim -> string_start_loc := Location.curr lexbuf; store_lexeme lexbuf; is_in_string := true; @@ -804,8 +870,9 @@ and quoted_string delim = parse | eof { is_in_string := false; error_loc !string_start_loc Unterminated_string } - | "|" (lowercase* as edelim) "}" + | "|" (ident_ext? as raw_edelim) "}" { + let edelim = validate_encoding lexbuf raw_edelim in if delim = edelim then lexbuf.lex_start_p else (store_lexeme lexbuf; quoted_string delim lexbuf) }