From cdb009957dc19ec58a42aefd0bf3b5b189c22112 Mon Sep 17 00:00:00 2001
From: Jules Aguillon <jules@j3s.fr>
Date: Wed, 20 Nov 2024 15:13:47 +0100
Subject: [PATCH] 5.3 support: Backport utf8 identifiers (#2622)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vendor required code from Misc.Utf8_lexeme, String, Bytes and Uchar.

Backport utf8 identifiers to standard and extended parser. This reduces the
diff with upstream.

* test: Add utf8_identifiers.ml

Taken from the compiler source.

* test_branch: Ignore parsing errors on 'infer'

The project contains a syntax error, that is new in OCaml 5.3:

    (** [x < min({x'|x'∊l})] *)
---
 CHANGES.md                                    |   5 +-
 test-extra/Makefile                           |   4 +-
 test/failing/tests/unit_lex.ml.broken-ref     |   9 +-
 test/failing/tests/unit_values.ml.broken-ref  |  11 +-
 test/passing/gen/dune.inc                     |  15 +
 .../refs.default/utf8_identifiers.ml.ref      |  18 +
 .../refs.janestreet/utf8_identifiers.ml.ref   |  19 +
 .../refs.ocamlformat/utf8_identifiers.ml.ref  |  18 +
 test/passing/tests/utf8_identifiers.ml        |  18 +
 vendor/parser-extended/lexer.mll              | 153 +++++--
 vendor/parser-shims/misc_.ml                  | 393 ++++++++++++++++++
 .../parser-shims/ocamlformat_parser_shims.ml  | 166 +-------
 .../stdlib_shims/ocamlformat_stdlib_shims.ml  | 115 +++++
 vendor/parser-standard/lexer.mll              | 149 +++++--
 14 files changed, 825 insertions(+), 268 deletions(-)
 create mode 100644 test/passing/refs.default/utf8_identifiers.ml.ref
 create mode 100644 test/passing/refs.janestreet/utf8_identifiers.ml.ref
 create mode 100644 test/passing/refs.ocamlformat/utf8_identifiers.ml.ref
 create mode 100644 test/passing/tests/utf8_identifiers.ml
 create mode 100644 vendor/parser-shims/misc_.ml
diff --git a/CHANGES.md b/CHANGES.md
index 7efb810f96..70a1788e76 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -14,8 +14,9 @@ profile. This started with version 0.26.0.
   This might change the formatting of some functions due to the formatting code
   being completely rewritten.
 
-- Support OCaml 5.3 syntax (#2609, #2610, #2611, @Julow)
-  This adds support for short functor type arguments syntax.
+- Support OCaml 5.3 syntax (#2609, #2610, #2611, #2622, @Julow)
+  This adds support for short functor type arguments syntax and utf8
+  identifiers.
 
 - Documentation comments are now formatted by default (#2390, @Julow)
   Use the option `parse-docstrings = false` to restore the previous behavior.
diff --git a/test-extra/Makefile b/test-extra/Makefile
index 7ddae35ca8..54ca068ddd 100644
--- a/test-extra/Makefile
+++ b/test-extra/Makefile
@@ -13,11 +13,11 @@
 #       make DIRS=<directory> test
 # By default, test projects used as regression tests
 DIRS= \
-	code/ocamlformat code/infer code/js_of_ocaml code/dune code/irmin \
+	code/ocamlformat code/js_of_ocaml code/dune code/irmin \
 	code/dune-release code/mirage code/ppxlib code/base
 
 # Extra test directories, for which looser checking is done
-XDIRS=code/ocaml
+XDIRS=code/ocaml code/infer
 
 # Directories to ignore (given to find, compared literally)
 PRUNE_DIRS= \
diff --git a/test/failing/tests/unit_lex.ml.broken-ref b/test/failing/tests/unit_lex.ml.broken-ref
index fd57965f35..e767e1d6c8 100644
--- a/test/failing/tests/unit_lex.ml.broken-ref
+++ b/test/failing/tests/unit_lex.ml.broken-ref
@@ -1,10 +1,5 @@
+ocamlformat: ignoring "tests/unit_lex.ml" (syntax error)
 File "tests/unit_lex.ml", line 18, characters 4-10:
 18 |     ������ (* this file must be iso-8859-1 *)
          ^^^^^^
-Alert deprecated: ISO-Latin1 characters in identifiers
-ocamlformat: ignoring "tests/unit_lex.ml" (syntax error)
-
-File "tests/unit_lex.ml", line 55, characters 2-8:
-55 |   '\999'; (* wrong, but yet... *)
-       ^^^^^^
-Error: Illegal backslash escape in string or character ('\999'): 999 is outside the range of legal characters (0-255).
+Error: Invalid encoding of identifier ������.
diff --git a/test/failing/tests/unit_values.ml.broken-ref b/test/failing/tests/unit_values.ml.broken-ref
index 95354696a4..3475fbf338 100644
--- a/test/failing/tests/unit_values.ml.broken-ref
+++ b/test/failing/tests/unit_values.ml.broken-ref
@@ -1,10 +1,5 @@
-File "tests/unit_values.ml", line 6, characters 10-11:
-6 | let i32 = −1073741824, 1073741823
-              ^
-Alert deprecated: ISO-Latin1 characters in identifiers
 ocamlformat: ignoring "tests/unit_values.ml" (syntax error)
-
-File "tests/unit_values.ml", line 6, characters 11-12:
+File "tests/unit_values.ml", line 6, characters 10-23:
 6 | let i32 = −1073741824, 1073741823
-               ^
-Error: Illegal character (\136)
+              ^^^^^^^^^^^^^
+Error: Invalid character U+2212 in identifier
diff --git a/test/passing/gen/dune.inc b/test/passing/gen/dune.inc
index e78aeaea66..ff65935a81 100644
--- a/test/passing/gen/dune.inc
+++ b/test/passing/gen/dune.inc
@@ -4525,6 +4525,21 @@
  (alias runtest)
  (action (diff use_file.mlt.err use_file.mlt.stderr)))
 
+(rule
+ (deps .ocamlformat dune-project)
+ (action
+  (with-stdout-to utf8_identifiers.ml.stdout
+   (with-stderr-to utf8_identifiers.ml.stderr
+     (run %{bin:ocamlformat} --name utf8_identifiers.ml --margin-check %{dep:../tests/utf8_identifiers.ml})))))
+
+(rule
+ (alias runtest)
+ (action (diff utf8_identifiers.ml.ref utf8_identifiers.ml.stdout)))
+
+(rule
+ (alias runtest)
+ (action (diff utf8_identifiers.ml.err utf8_identifiers.ml.stderr)))
+
 (rule
  (deps .ocamlformat dune-project)
  (action
diff --git a/test/passing/refs.default/utf8_identifiers.ml.ref b/test/passing/refs.default/utf8_identifiers.ml.ref
new file mode 100644
index 0000000000..683cf9c3ae
--- /dev/null
+++ b/test/passing/refs.default/utf8_identifiers.ml.ref
@@ -0,0 +1,18 @@
+(* TEST
+readonly_files = "genfiles.ml";
+setup-ocamlc.byte-build-env;
+all_modules = "genfiles.ml";
+program = "./genfiles.byte.exe";
+ocamlc.byte;
+run;
+all_modules = "été.ml ça.ml test.ml";
+program = "./main.byte.exe";
+ocamlc.byte;
+run;
+*)
+
+let _ =
+  (* Source is NFC *)
+  assert (Été.x + Ça.x = 3);
+  (* Source is NFD *)
+  assert (Été.x + Ça.x = 3)
diff --git a/test/passing/refs.janestreet/utf8_identifiers.ml.ref b/test/passing/refs.janestreet/utf8_identifiers.ml.ref
new file mode 100644
index 0000000000..87f5c89d00
--- /dev/null
+++ b/test/passing/refs.janestreet/utf8_identifiers.ml.ref
@@ -0,0 +1,19 @@
+(* TEST
+readonly_files = "genfiles.ml";
+setup-ocamlc.byte-build-env;
+all_modules = "genfiles.ml";
+program = "./genfiles.byte.exe";
+ocamlc.byte;
+run;
+all_modules = "été.ml ça.ml test.ml";
+program = "./main.byte.exe";
+ocamlc.byte;
+run;
+*)
+
+let _ =
+  (* Source is NFC *)
+  assert (Été.x + Ça.x = 3);
+  (* Source is NFD *)
+  assert (Été.x + Ça.x = 3)
+;;
diff --git a/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref b/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref
new file mode 100644
index 0000000000..f28103171f
--- /dev/null
+++ b/test/passing/refs.ocamlformat/utf8_identifiers.ml.ref
@@ -0,0 +1,18 @@
+(* TEST
+readonly_files = "genfiles.ml";
+setup-ocamlc.byte-build-env;
+all_modules = "genfiles.ml";
+program = "./genfiles.byte.exe";
+ocamlc.byte;
+run;
+all_modules = "été.ml ça.ml test.ml";
+program = "./main.byte.exe";
+ocamlc.byte;
+run;
+*)
+
+let _ =
+  (* Source is NFC *)
+  assert (Été.x + Ça.x = 3) ;
+  (* Source is NFD *)
+  assert (Été.x + Ça.x = 3)
diff --git a/test/passing/tests/utf8_identifiers.ml b/test/passing/tests/utf8_identifiers.ml
new file mode 100644
index 0000000000..b9c8cb0d5b
--- /dev/null
+++ b/test/passing/tests/utf8_identifiers.ml
@@ -0,0 +1,18 @@
+(* TEST
+readonly_files = "genfiles.ml";
+setup-ocamlc.byte-build-env;
+all_modules = "genfiles.ml";
+program = "./genfiles.byte.exe";
+ocamlc.byte;
+run;
+all_modules = "été.ml ça.ml test.ml";
+program = "./main.byte.exe";
+ocamlc.byte;
+run;
+*)
+
+let _ =
+  (* Source is NFC *)
+  assert (Été.x + Ça.x = 3);
+  (* Source is NFD *)
+  assert (Été.x + Ça.x = 3)
diff --git a/vendor/parser-extended/lexer.mll b/vendor/parser-extended/lexer.mll
index 4abdb11f19..e1941d0a63 100644
--- a/vendor/parser-extended/lexer.mll
+++ b/vendor/parser-extended/lexer.mll
@@ -29,8 +29,13 @@ type error =
   | Unterminated_string_in_comment of Location.t * Location.t
   | Empty_character_literal
   | Keyword_as_label of string
+  | Capitalized_label of string
   | Invalid_literal of string
   | Invalid_directive of string * string option
+  | Invalid_encoding of string
+  | Invalid_char_in_ident of Uchar.t
+  | Non_lowercase_delimiter of string
+  | Capitalized_raw_identifier of string
 
 exception Error of error * Location.t
 
@@ -228,10 +233,46 @@ let uchar_for_uchar_escape lexbuf =
       illegal_escape lexbuf
         (Printf.sprintf "%X is not a Unicode scalar value" cp)
 
+let validate_encoding lexbuf raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> error lexbuf (Invalid_encoding raw_name)
+  | Ok name -> name
+
+let ident_for_extended lexbuf raw_name =
+  let name = validate_encoding lexbuf raw_name in
+  match Utf8_lexeme.validate_identifier name with
+  | Utf8_lexeme.Valid -> name
+  | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
+  | Utf8_lexeme.Invalid_beginning _ ->
+  assert false (* excluded by the regexps *)
+
+let validate_delim lexbuf raw_name =
+  let name = validate_encoding lexbuf raw_name in
+  if Utf8_lexeme.is_lowercase name then name
+  else error lexbuf (Non_lowercase_delimiter name)
+
+let validate_ext lexbuf name =
+    let name = validate_encoding lexbuf name in
+    match Utf8_lexeme.validate_identifier ~with_dot:true name with
+    | Utf8_lexeme.Valid -> name
+    | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
+    | Utf8_lexeme.Invalid_beginning _ ->
+    assert false (* excluded by the regexps *)
+
+let lax_delim raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> None
+  | Ok name ->
+     if Utf8_lexeme.is_lowercase name then Some name
+     else None
+
 let is_keyword name = Hashtbl.mem keyword_table name
 
-let check_label_name lexbuf name =
-  if is_keyword name then error lexbuf (Keyword_as_label name)
+let check_label_name ?(raw_escape=false) lexbuf name =
+  if Utf8_lexeme.is_capitalized name then
+    error lexbuf (Capitalized_label name);
+  if not raw_escape && is_keyword name then
+    error lexbuf (Keyword_as_label name)
 
 (* To "unlex" a few characters *)
 let set_lexeme_length buf n = (
@@ -282,13 +323,6 @@ let preprocessor = ref None
 
 let escaped_newlines = ref false
 
-(* Warn about Latin-1 characters used in idents *)
-
-let warn_latin1 lexbuf =
-  Location.deprecated
-    (Location.curr lexbuf)
-    "ISO-Latin1 characters in identifiers"
-
 type comment = [ `Comment of string | `Docstring of string ]
 
 let handle_docstrings = ref true
@@ -340,6 +374,10 @@ let prepare_error loc = function
   | Keyword_as_label kwd ->
       Location.errorf ~loc
         "%a is a keyword, it cannot be used as label name" Style.inline_code kwd
+  | Capitalized_label lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as label name, \
+         it must start with a lowercase letter" Style.inline_code lbl
   | Invalid_literal s ->
       Location.errorf ~loc "Invalid literal %s" s
   | Invalid_directive (dir, explanation) ->
@@ -347,6 +385,20 @@ let prepare_error loc = function
         (fun ppf -> match explanation with
            | None -> ()
            | Some expl -> fprintf ppf ": %s" expl)
+  | Invalid_encoding s ->
+    Location.errorf ~loc "Invalid encoding of identifier %s." s
+  | Invalid_char_in_ident u ->
+      Location.errorf ~loc "Invalid character U+%X in identifier"
+         (Uchar.to_int u)
+  | Capitalized_raw_identifier lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as a raw identifier, \
+         it must start with a lowercase letter" Style.inline_code lbl
+  | Non_lowercase_delimiter name ->
+      Location.errorf ~loc
+        "%a cannot be used as a quoted string delimiter,@ \
+         it must contain only lowercase letters."
+         Style.inline_code name
 
 let () =
   Location.register_error_of_exn
@@ -363,12 +415,11 @@ let newline = ('\013'* '\010')
 let blank = [' ' '\009' '\012']
 let lowercase = ['a'-'z' '_']
 let uppercase = ['A'-'Z']
+let identstart = lowercase | uppercase
 let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
-let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
-let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222']
-let identchar_latin1 =
-  ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
-(* This should be kept in sync with the [is_identchar] function in [env.ml] *)
+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
+let identstart_ext = identstart | utf8
+let identchar_ext = identchar | utf8
 
 let symbolchar =
   ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
@@ -380,7 +431,8 @@ let kwdopchar =
   ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|']
 
 let ident = (lowercase | uppercase) identchar*
-let extattrident = ident ('.' ident)*
+let ident_ext = identstart_ext  identchar_ext*
+let extattrident = ident_ext ('.' ident_ext)*
 
 let decimal_literal =
   ['0'-'9'] ['0'-'9' '_']*
@@ -423,36 +475,40 @@ rule token = parse
   | ".~"
       { error lexbuf
           (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) }
-  | "~" (raw_ident_escape lowercase identchar * as name) ':'
-      { LABEL name }
-  | "~" (lowercase identchar * as name) ':'
+  | "~" (identstart identchar * as name) ':'
       { check_label_name lexbuf name;
         LABEL name }
-  | "~" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf;
-        LABEL name }
+  | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { let name = ident_for_extended lexbuf raw_name in
+        check_label_name ~raw_escape:(escape<>"") lexbuf name;
+        LABEL (escape ^ name) }
   | "?"
       { QUESTION }
-  | "?" (raw_ident_escape lowercase identchar * as name) ':'
-      { OPTLABEL name }
   | "?" (lowercase identchar * as name) ':'
       { check_label_name lexbuf name;
         OPTLABEL name }
-  | "?" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf;
-        OPTLABEL name }
-  | (raw_ident_escape lowercase identchar * as name)
-      (* Compared to upstream, the raw_ident_escape is part of the lident. *)
-      { LIDENT name }
+  | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { let name = ident_for_extended lexbuf raw_name in
+        check_label_name ~raw_escape:(escape<>"") lexbuf name;
+        OPTLABEL (escape ^ name)
+      }
   | lowercase identchar * as name
       { try Hashtbl.find keyword_table name
         with Not_found -> LIDENT name }
-  | lowercase_latin1 identchar_latin1 * as name
-      { warn_latin1 lexbuf; LIDENT name }
   | uppercase identchar * as name
       { UIDENT name } (* No capitalized keywords *)
-  | uppercase_latin1 identchar_latin1 * as name
-      { warn_latin1 lexbuf; UIDENT name }
+  | (raw_ident_escape? as escape) (ident_ext as raw_name)
+      { let name = ident_for_extended lexbuf raw_name in
+        if Utf8_lexeme.is_capitalized name then begin
+            if escape="" then UIDENT name
+            else
+              (* we don't have capitalized keywords, and thus no needs for
+                 capitalized raw identifiers. *)
+              error lexbuf (Capitalized_raw_identifier name)
+        end else
+          (* Compared to upstream, the raw_ident_escape is part of the lident. *)
+          LIDENT (escape ^ name)
+      } (* No non-ascii keywords *)
   | int_literal as lit { INT (lit, None) }
   | (int_literal as lit) (literal_modifier as modif)
       { INT (lit, Some modif) }
@@ -465,26 +521,34 @@ rule token = parse
   | "\""
       { let s, loc = wrap_string_lexer string lexbuf in
         STRING (s, loc, None) }
-  | "{" (lowercase* as delim) "|"
-      { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
-        STRING (s, loc, Some delim) }
-  | "{%" (extattrident as id) "|"
+  | "{" (ident_ext? as raw_name) '|'
+      { let delim = validate_delim lexbuf raw_name in
+        let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
+        STRING (s, loc, Some delim)
+       }
+  | "{%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
         let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
         QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") }
-  | "{%" (extattrident as id) blank+ (lowercase* as delim) "|"
+  | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
+        let delim = validate_delim lexbuf raw_delim in
         let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
         QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) }
-  | "{%%" (extattrident as id) "|"
+  | "{%%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
         let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
         QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") }
-  | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|"
+  | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
+        let delim = validate_delim lexbuf raw_delim in
         let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
         QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) }
@@ -664,8 +728,10 @@ and comment = parse
         is_in_string := false;
         store_string_char '\"';
         comment lexbuf }
-  | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|"
-      {
+  | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|"
+      { match lax_delim raw_delim with
+        | None -> store_lexeme lexbuf; comment lexbuf
+        | Some delim ->
         string_start_loc := Location.curr lexbuf;
         store_lexeme lexbuf;
         is_in_string := true;
@@ -775,8 +841,9 @@ and quoted_string delim = parse
   | eof
       { is_in_string := false;
         error_loc !string_start_loc Unterminated_string }
-  | "|" (lowercase* as edelim) "}"
+  | "|" (ident_ext? as raw_edelim) "}"
       {
+        let edelim = validate_encoding lexbuf raw_edelim in
         if delim = edelim then lexbuf.lex_start_p
         else (store_lexeme lexbuf; quoted_string delim lexbuf)
       }
diff --git a/vendor/parser-shims/misc_.ml b/vendor/parser-shims/misc_.ml
new file mode 100644
index 0000000000..e8bf50687d
--- /dev/null
+++ b/vendor/parser-shims/misc_.ml
@@ -0,0 +1,393 @@
+(** {1 Minimal support for Unicode characters in identifiers} *)
+
+include Misc
+
+module Color = struct
+  include Color
+
+  external isatty : out_channel -> bool = "caml_sys_isatty"
+
+  (* reasonable heuristic on whether colors should be enabled *)
+  let should_enable_color () =
+    let term = try Sys.getenv "TERM" with Not_found -> "" in
+    term <> "dumb"
+    && term <> ""
+    && isatty stderr
+
+  let default_setting = Auto
+  let enabled = ref true
+end
+
+module Error_style = struct
+  include Error_style
+
+  let default_setting = Contextual
+end
+
+(* Terminal styling handling *)
+module Style = struct
+  (* use ANSI color codes, see https://en.wikipedia.org/wiki/ANSI_escape_code *)
+  type color =
+    | Black
+    | Red
+    | Green
+    | Yellow
+    | Blue
+    | Magenta
+    | Cyan
+    | White
+
+  type style =
+    | FG of color (* foreground *)
+    | BG of color (* background *)
+    | Bold
+    | Reset
+
+  let ansi_of_color = function
+    | Black -> "0"
+    | Red -> "1"
+    | Green -> "2"
+    | Yellow -> "3"
+    | Blue -> "4"
+    | Magenta -> "5"
+    | Cyan -> "6"
+    | White -> "7"
+
+  let code_of_style = function
+    | FG c -> "3" ^ ansi_of_color c
+    | BG c -> "4" ^ ansi_of_color c
+    | Bold -> "1"
+    | Reset -> "0"
+
+  let ansi_of_style_l l =
+    let s = match l with
+      | [] -> code_of_style Reset
+      | [s] -> code_of_style s
+      | _ -> String.concat ";" (List.map code_of_style l)
+    in
+    "\x1b[" ^ s ^ "m"
+
+  type Format.stag += Style of style list
+
+  type tag_style ={
+    ansi: style list;
+    text_open:string;
+    text_close:string
+  }
+
+  type styles = {
+    error: tag_style;
+    warning: tag_style;
+    loc: tag_style;
+    hint: tag_style;
+    inline_code: tag_style;
+  }
+
+  let no_markup stl = { ansi = stl; text_close = ""; text_open = "" }
+
+  let default_styles = {
+      warning = no_markup [Bold; FG Magenta];
+      error = no_markup [Bold; FG Red];
+      loc = no_markup [Bold];
+      hint = no_markup [Bold; FG Blue];
+      inline_code= { ansi=[Bold]; text_open = {|"|}; text_close = {|"|} }
+    }
+
+  let cur_styles = ref default_styles
+  let get_styles () = !cur_styles
+  let set_styles s = cur_styles := s
+
+  (* map a tag to a style, if the tag is known.
+   @raise Not_found otherwise *)
+  let style_of_tag s = match s with
+    | Format.String_tag "error" ->  (!cur_styles).error
+    | Format.String_tag "warning" ->(!cur_styles).warning
+    | Format.String_tag "loc" -> (!cur_styles).loc
+    | Format.String_tag "hint" -> (!cur_styles).hint
+    | Format.String_tag "inline_code" -> (!cur_styles).inline_code
+    | Style s -> no_markup s
+    | _ -> raise Not_found
+
+
+  let as_inline_code printer ppf x =
+    let open Format_doc in
+    pp_open_stag ppf (Format.String_tag "inline_code");
+    printer ppf x;
+    pp_close_stag ppf ()
+
+  let inline_code ppf s = as_inline_code Format_doc.pp_print_string ppf s
+
+  (* either prints the tag of [s] or delegates to [or_else] *)
+  let mark_open_tag ~or_else s =
+    try
+      let style = style_of_tag s in
+      if !Color.enabled then ansi_of_style_l style.ansi else style.text_open
+    with Not_found -> or_else s
+
+  let mark_close_tag ~or_else s =
+    try
+      let style = style_of_tag s in
+      if !Color.enabled then ansi_of_style_l [Reset] else style.text_close
+    with Not_found -> or_else s
+
+  (* add tag handling to formatter [ppf] *)
+  let set_tag_handling ppf =
+    let open Format in
+    let functions = pp_get_formatter_stag_functions ppf () in
+    let functions' = {functions with
+      mark_open_stag=(mark_open_tag ~or_else:functions.mark_open_stag);
+      mark_close_stag=(mark_close_tag ~or_else:functions.mark_close_stag);
+    } in
+    pp_set_mark_tags ppf true; (* enable tags *)
+    pp_set_formatter_stag_functions ppf functions';
+    ()
+
+  let setup =
+    let first = ref true in (* initialize only once *)
+    let formatter_l =
+      [Format.std_formatter; Format.err_formatter; Format.str_formatter]
+    in
+    let enable_color = function
+      | Color.Auto -> Color.should_enable_color ()
+      | Color.Always -> true
+      | Color.Never -> false
+    in
+    fun o ->
+      if !first then (
+        first := false;
+        Format.set_mark_tags true;
+        List.iter set_tag_handling formatter_l;
+        Color.enabled := (match o with
+          | Some s -> enable_color s
+          | None -> enable_color Color.default_setting)
+      );
+      ()
+end
+
+module Utf8_lexeme = struct
+
+  type t = string
+
+  (* Non-ASCII letters that are allowed in identifiers (currently: Latin-9) *)
+
+  type case = Upper of Uchar.t | Lower of Uchar.t
+  let known_chars : (Uchar.t, case) Hashtbl.t = Hashtbl.create 32
+
+  let _ =
+    List.iter
+      (fun (upper, lower) ->
+        let upper = Uchar.of_int upper and lower = Uchar.of_int lower in
+        Hashtbl.add known_chars upper (Upper lower);
+        Hashtbl.add known_chars lower (Lower upper))
+  [
+    (0xc0, 0xe0); (* À, à *)    (0xc1, 0xe1); (* Á, á *)
+    (0xc2, 0xe2); (* Â, â *)    (0xc3, 0xe3); (* Ã, ã *)
+    (0xc4, 0xe4); (* Ä, ä *)    (0xc5, 0xe5); (* Å, å *)
+    (0xc6, 0xe6); (* Æ, æ *)    (0xc7, 0xe7); (* Ç, ç *)
+    (0xc8, 0xe8); (* È, è *)    (0xc9, 0xe9); (* É, é *)
+    (0xca, 0xea); (* Ê, ê *)    (0xcb, 0xeb); (* Ë, ë *)
+    (0xcc, 0xec); (* Ì, ì *)    (0xcd, 0xed); (* Í, í *)
+    (0xce, 0xee); (* Î, î *)    (0xcf, 0xef); (* Ï, ï *)
+    (0xd0, 0xf0); (* Ð, ð *)    (0xd1, 0xf1); (* Ñ, ñ *)
+    (0xd2, 0xf2); (* Ò, ò *)    (0xd3, 0xf3); (* Ó, ó *)
+    (0xd4, 0xf4); (* Ô, ô *)    (0xd5, 0xf5); (* Õ, õ *)
+    (0xd6, 0xf6); (* Ö, ö *)    (0xd8, 0xf8); (* Ø, ø *)
+    (0xd9, 0xf9); (* Ù, ù *)    (0xda, 0xfa); (* Ú, ú *)
+    (0xdb, 0xfb); (* Û, û *)    (0xdc, 0xfc); (* Ü, ü *)
+    (0xdd, 0xfd); (* Ý, ý *)    (0xde, 0xfe); (* Þ, þ *)
+    (0x160, 0x161); (* Š, š *)  (0x17d, 0x17e); (* Ž, ž *)
+    (0x152, 0x153); (* Œ, œ *)  (0x178, 0xff); (* Ÿ, ÿ *)
+    (0x1e9e, 0xdf); (* ẞ, ß *)
+  ]
+
+  (* NFD to NFC conversion table for the letters above *)
+
+  let known_pairs : (Uchar.t * Uchar.t, Uchar.t) Hashtbl.t = Hashtbl.create 32
+
+  let _ =
+    List.iter
+      (fun (c1, n2, n) ->
+        Hashtbl.add known_pairs
+          (Uchar.of_char c1, Uchar.of_int n2) (Uchar.of_int n))
+  [
+    ('A', 0x300, 0xc0); (* À *)    ('A', 0x301, 0xc1); (* Á *)
+    ('A', 0x302, 0xc2); (* Â *)    ('A', 0x303, 0xc3); (* Ã *)
+    ('A', 0x308, 0xc4); (* Ä *)    ('A', 0x30a, 0xc5); (* Å *)
+    ('C', 0x327, 0xc7); (* Ç *)    ('E', 0x300, 0xc8); (* È *)
+    ('E', 0x301, 0xc9); (* É *)    ('E', 0x302, 0xca); (* Ê *)
+    ('E', 0x308, 0xcb); (* Ë *)    ('I', 0x300, 0xcc); (* Ì *)
+    ('I', 0x301, 0xcd); (* Í *)    ('I', 0x302, 0xce); (* Î *)
+    ('I', 0x308, 0xcf); (* Ï *)    ('N', 0x303, 0xd1); (* Ñ *)
+    ('O', 0x300, 0xd2); (* Ò *)    ('O', 0x301, 0xd3); (* Ó *)
+    ('O', 0x302, 0xd4); (* Ô *)    ('O', 0x303, 0xd5); (* Õ *)
+    ('O', 0x308, 0xd6); (* Ö *)
+    ('U', 0x300, 0xd9); (* Ù *)    ('U', 0x301, 0xda); (* Ú *)
+    ('U', 0x302, 0xdb); (* Û *)    ('U', 0x308, 0xdc); (* Ü *)
+    ('Y', 0x301, 0xdd); (* Ý *)    ('Y', 0x308, 0x178);  (* Ÿ *)
+    ('S', 0x30c, 0x160); (* Š *)   ('Z', 0x30c, 0x17d); (* Ž *)
+    ('a', 0x300, 0xe0); (* à *)    ('a', 0x301, 0xe1); (* á *)
+    ('a', 0x302, 0xe2); (* â *)    ('a', 0x303, 0xe3); (* ã *)
+    ('a', 0x308, 0xe4); (* ä *)    ('a', 0x30a, 0xe5); (* å *)
+    ('c', 0x327, 0xe7); (* ç *)    ('e', 0x300, 0xe8); (* è *)
+    ('e', 0x301, 0xe9); (* é *)    ('e', 0x302, 0xea); (* ê *)
+    ('e', 0x308, 0xeb); (* ë *)    ('i', 0x300, 0xec); (* ì *)
+    ('i', 0x301, 0xed); (* í *)    ('i', 0x302, 0xee); (* î *)
+    ('i', 0x308, 0xef); (* ï *)    ('n', 0x303, 0xf1); (* ñ *)
+    ('o', 0x300, 0xf2); (* ò *)    ('o', 0x301, 0xf3); (* ó *)
+    ('o', 0x302, 0xf4); (* ô *)    ('o', 0x303, 0xf5); (* õ *)
+    ('o', 0x308, 0xf6); (* ö *)
+    ('u', 0x300, 0xf9); (* ù *)    ('u', 0x301, 0xfa); (* ú *)
+    ('u', 0x302, 0xfb); (* û *)    ('u', 0x308, 0xfc); (* ü *)
+    ('y', 0x301, 0xfd); (* ý *)    ('y', 0x308, 0xff); (* ÿ *)
+    ('s', 0x30c, 0x161); (* š *)   ('z', 0x30c, 0x17e); (* ž *)
+  ]
+
+  let normalize_generic ~keep_ascii transform s =
+    let rec norm check buf prev i =
+      if i >= String.length s then begin
+        Buffer.add_utf_8_uchar buf (transform prev)
+      end else begin
+        let d = String.get_utf_8_uchar s i in
+        let u = Uchar.utf_decode_uchar d in
+        check d u;
+        let i' = i + Uchar.utf_decode_length d in
+        match Hashtbl.find_opt known_pairs (prev, u) with
+        | Some u' ->
+            norm check buf u' i'
+        | None ->
+            Buffer.add_utf_8_uchar buf (transform prev);
+            norm check buf u i'
+      end in
+    let ascii_limit = 128 in
+    if s = ""
+    || keep_ascii && String.for_all (fun x -> Char.code x < ascii_limit) s
+    then Ok s
+    else
+      let buf = Buffer.create (String.length s) in
+      let valid = ref true in
+      let check d u =
+        valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep
+      in
+      let d = String.get_utf_8_uchar s 0 in
+      let u = Uchar.utf_decode_uchar d in
+      check d u;
+      norm check buf u (Uchar.utf_decode_length d);
+      let contents = Buffer.contents buf in
+      if !valid then
+        Ok contents
+      else
+        Error contents
+
+  let normalize s =
+    normalize_generic ~keep_ascii:true (fun u -> u) s
+
+  (* Capitalization *)
+
+  let uchar_is_uppercase u =
+    let c = Uchar.to_int u in
+    if c < 0x80 then c >= 65 && c <= 90 else
+      match Hashtbl.find_opt known_chars u with
+      | Some(Upper _) -> true
+      | _ -> false
+
+  let uchar_lowercase u =
+    let c = Uchar.to_int u in
+    if c < 0x80 then
+      if c >= 65 && c <= 90 then Uchar.of_int (c + 32) else u
+    else
+      match Hashtbl.find_opt known_chars u with
+      | Some(Upper u') -> u'
+      | _ -> u
+
+  let uchar_uppercase u =
+    let c = Uchar.to_int u in
+    if c < 0x80 then
+      if c >= 97 && c <= 122 then Uchar.of_int (c - 32) else u
+    else
+      match Hashtbl.find_opt known_chars u with
+      | Some(Lower u') -> u'
+      | _ -> u
+
+  let capitalize s =
+    let first = ref true in
+    normalize_generic ~keep_ascii:false
+      (fun u -> if !first then (first := false; uchar_uppercase u) else u)
+      s
+
+  let uncapitalize s =
+    let first = ref true in
+    normalize_generic ~keep_ascii:false
+      (fun u -> if !first then (first := false; uchar_lowercase u) else u)
+      s
+
+  let is_capitalized s =
+    s <> "" &&
+    uchar_is_uppercase (Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0))
+
+  (* Characters allowed in identifiers after normalization is applied.
+     Currently:
+       - ASCII letters, underscore
+       - Latin-9 letters, represented in NFC
+       - ASCII digits, single quote (but not as first character)
+       - dot if [with_dot] = true
+  *)
+  let uchar_valid_in_identifier ~with_dot u =
+    let c = Uchar.to_int u in
+    if c < 0x80 then
+         c >= 97 (* a *) && c <= 122 (* z *)
+      || c >= 65 (* A *) && c <= 90 (* Z *)
+      || c >= 48 (* 0 *) && c <= 57 (* 9 *)
+      || c = 95 (* underscore *)
+      || c = 39 (* single quote *)
+      || (with_dot && c = 46) (* dot *)
+    else
+      Hashtbl.mem known_chars u
+
+  let uchar_not_identifier_start u =
+    let c = Uchar.to_int u in
+       c >= 48 (* 0 *) && c <= 57 (* 9 *)
+    || c = 39  (* single quote *)
+
+  (* Check whether a normalized string is a valid OCaml identifier. *)
+
+  type validation_result =
+    | Valid
+    | Invalid_character of Uchar.t   (** Character not allowed *)
+    | Invalid_beginning of Uchar.t   (** Character not allowed as first char *)
+
+  let validate_identifier ?(with_dot=false) s =
+    let rec check i =
+      if i >= String.length s then Valid else begin
+        let d = String.get_utf_8_uchar s i in
+        let u = Uchar.utf_decode_uchar d in
+        let i' = i + Uchar.utf_decode_length d in
+        if not (uchar_valid_in_identifier ~with_dot u) then
+          Invalid_character u
+        else if i = 0 && uchar_not_identifier_start u then
+          Invalid_beginning u
+        else
+          check i'
+      end
+    in check 0
+
+  let is_valid_identifier s =
+    validate_identifier s = Valid
+
+  let starts_like_a_valid_identifier s =
+    s <> "" &&
+    (let u = Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0) in
+     uchar_valid_in_identifier ~with_dot:false u
+     && not (uchar_not_identifier_start u))
+
+  let is_lowercase s =
+    let rec is_lowercase_at len s n =
+      if n >= len then true
+      else
+        let d = String.get_utf_8_uchar s n in
+        let u = Uchar.utf_decode_uchar d in
+        (uchar_valid_in_identifier ~with_dot:false  u)
+        && not (uchar_is_uppercase u)
+        && is_lowercase_at len s (n+Uchar.utf_decode_length d)
+    in
+    is_lowercase_at (String.length s) s 0
+end
diff --git a/vendor/parser-shims/ocamlformat_parser_shims.ml b/vendor/parser-shims/ocamlformat_parser_shims.ml
index b9d3f34885..1420bf7ba3 100644
--- a/vendor/parser-shims/ocamlformat_parser_shims.ml
+++ b/vendor/parser-shims/ocamlformat_parser_shims.ml
@@ -2,171 +2,7 @@
    shims. *)
 include Ocamlformat_stdlib_shims
 
-module Misc = struct
-  include Misc
-
-  module Color = struct
-    include Color
-
-    external isatty : out_channel -> bool = "caml_sys_isatty"
-
-    (* reasonable heuristic on whether colors should be enabled *)
-    let should_enable_color () =
-      let term = try Sys.getenv "TERM" with Not_found -> "" in
-      term <> "dumb"
-      && term <> ""
-      && isatty stderr
-
-    let default_setting = Auto
-    let enabled = ref true
-  end
-
-  module Error_style = struct
-    include Error_style
-
-    let default_setting = Contextual
-  end
-
-  (* Terminal styling handling *)
-  module Style = struct
-    (* use ANSI color codes, see https://en.wikipedia.org/wiki/ANSI_escape_code *)
-    type color =
-      | Black
-      | Red
-      | Green
-      | Yellow
-      | Blue
-      | Magenta
-      | Cyan
-      | White
-
-    type style =
-      | FG of color (* foreground *)
-      | BG of color (* background *)
-      | Bold
-      | Reset
-
-    let ansi_of_color = function
-      | Black -> "0"
-      | Red -> "1"
-      | Green -> "2"
-      | Yellow -> "3"
-      | Blue -> "4"
-      | Magenta -> "5"
-      | Cyan -> "6"
-      | White -> "7"
-
-    let code_of_style = function
-      | FG c -> "3" ^ ansi_of_color c
-      | BG c -> "4" ^ ansi_of_color c
-      | Bold -> "1"
-      | Reset -> "0"
-
-    let ansi_of_style_l l =
-      let s = match l with
-        | [] -> code_of_style Reset
-        | [s] -> code_of_style s
-        | _ -> String.concat ";" (List.map code_of_style l)
-      in
-      "\x1b[" ^ s ^ "m"
-
-    type Format.stag += Style of style list
-
-    type tag_style ={
-      ansi: style list;
-      text_open:string;
-      text_close:string
-    }
-
-    type styles = {
-      error: tag_style;
-      warning: tag_style;
-      loc: tag_style;
-      hint: tag_style;
-      inline_code: tag_style;
-    }
-
-    let no_markup stl = { ansi = stl; text_close = ""; text_open = "" }
-
-    let default_styles = {
-        warning = no_markup [Bold; FG Magenta];
-        error = no_markup [Bold; FG Red];
-        loc = no_markup [Bold];
-        hint = no_markup [Bold; FG Blue];
-        inline_code= { ansi=[Bold]; text_open = {|"|}; text_close = {|"|} }
-      }
-
-    let cur_styles = ref default_styles
-    let get_styles () = !cur_styles
-    let set_styles s = cur_styles := s
-
-    (* map a tag to a style, if the tag is known.
-     @raise Not_found otherwise *)
-    let style_of_tag s = match s with
-      | Format.String_tag "error" ->  (!cur_styles).error
-      | Format.String_tag "warning" ->(!cur_styles).warning
-      | Format.String_tag "loc" -> (!cur_styles).loc
-      | Format.String_tag "hint" -> (!cur_styles).hint
-      | Format.String_tag "inline_code" -> (!cur_styles).inline_code
-      | Style s -> no_markup s
-      | _ -> raise Not_found
-
-
-    let as_inline_code printer ppf x =
-      let open Format_doc in
-      pp_open_stag ppf (Format.String_tag "inline_code");
-      printer ppf x;
-      pp_close_stag ppf ()
-
-    let inline_code ppf s = as_inline_code Format_doc.pp_print_string ppf s
-
-    (* either prints the tag of [s] or delegates to [or_else] *)
-    let mark_open_tag ~or_else s =
-      try
-        let style = style_of_tag s in
-        if !Color.enabled then ansi_of_style_l style.ansi else style.text_open
-      with Not_found -> or_else s
-
-    let mark_close_tag ~or_else s =
-      try
-        let style = style_of_tag s in
-        if !Color.enabled then ansi_of_style_l [Reset] else style.text_close
-      with Not_found -> or_else s
-
-    (* add tag handling to formatter [ppf] *)
-    let set_tag_handling ppf =
-      let open Format in
-      let functions = pp_get_formatter_stag_functions ppf () in
-      let functions' = {functions with
-        mark_open_stag=(mark_open_tag ~or_else:functions.mark_open_stag);
-        mark_close_stag=(mark_close_tag ~or_else:functions.mark_close_stag);
-      } in
-      pp_set_mark_tags ppf true; (* enable tags *)
-      pp_set_formatter_stag_functions ppf functions';
-      ()
-
-    let setup =
-      let first = ref true in (* initialize only once *)
-      let formatter_l =
-        [Format.std_formatter; Format.err_formatter; Format.str_formatter]
-      in
-      let enable_color = function
-        | Color.Auto -> Color.should_enable_color ()
-        | Color.Always -> true
-        | Color.Never -> false
-      in
-      fun o ->
-        if !first then (
-          first := false;
-          Format.set_mark_tags true;
-          List.iter set_tag_handling formatter_l;
-          Color.enabled := (match o with
-            | Some s -> enable_color s
-            | None -> enable_color Color.default_setting)
-        );
-        ()
-  end
-end
+module Misc = Misc_
 
 module Clflags : sig
   val include_dirs : string list ref
diff --git a/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml b/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml
index 3dbd00f7ae..155512c609 100644
--- a/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml
+++ b/vendor/parser-shims/stdlib_shims/ocamlformat_stdlib_shims.ml
@@ -20,3 +20,118 @@ end
 module Either = struct
   type ('a, 'b) t = Left of 'a | Right of 'b
 end
+
+module Uchar = struct
+  include Uchar
+
+  let rep_ = 0xFFFD
+  let rep = unsafe_of_int rep_
+  let valid_bit = 27
+  let decode_bits = 24
+  let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1
+  let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
+  let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF)
+  let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (to_int u)
+  let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep_
+end
+
+module Bytes = struct
+  include Bytes
+
+  external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get"
+  let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
+  let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
+  let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
+  let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
+  let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
+  let dec_invalid = Uchar.utf_decode_invalid
+  let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
+
+  let[@inline] utf_8_uchar_2 b0 b1 =
+    ((b0 land 0x1F) lsl 6) lor
+    ((b1 land 0x3F))
+
+  let[@inline] utf_8_uchar_3 b0 b1 b2 =
+    ((b0 land 0x0F) lsl 12) lor
+    ((b1 land 0x3F) lsl 6) lor
+    ((b2 land 0x3F))
+
+  let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
+    ((b0 land 0x07) lsl 18) lor
+    ((b1 land 0x3F) lsl 12) lor
+    ((b2 land 0x3F) lsl 6) lor
+    ((b3 land 0x3F))
+
+  let get_utf_8_uchar b i =
+    let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
+    let get = unsafe_get_uint8 in
+    let max = length b - 1 in
+    match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
+    | '\x00' .. '\x7F' -> dec_ret 1 b0
+    | '\xC2' .. '\xDF' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+        dec_ret 2 (utf_8_uchar_2 b0 b1)
+    | '\xE0' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+    | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+    | '\xED' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+    | '\xF0' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        let i = i + 1 in if i > max then dec_invalid 3 else
+        let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+        dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+    | '\xF1' .. '\xF3' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        let i = i + 1 in if i > max then dec_invalid 3 else
+        let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+        dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+    | '\xF4' ->
+        let i = i + 1 in if i > max then dec_invalid 1 else
+        let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
+        let i = i + 1 in if i > max then dec_invalid 2 else
+        let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+        let i = i + 1 in if i > max then dec_invalid 3 else
+        let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+        dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+    | _ -> dec_invalid 1
+
+  let for_all p s =
+    let n = length s in
+    let rec loop i =
+      if i = n then true
+      else if p (unsafe_get s i) then loop (succ i)
+      else false in
+    loop 0
+end
+
+module String = struct
+  include String
+  module B = Bytes
+
+  let bos = B.unsafe_of_string
+  let get_utf_8_uchar s i = B.get_utf_8_uchar (bos s) i
+
+  let for_all f s =
+    B.for_all f (bos s)
+end
diff --git a/vendor/parser-standard/lexer.mll b/vendor/parser-standard/lexer.mll
index 7bdeadcd00..5fad8bc9b4 100644
--- a/vendor/parser-standard/lexer.mll
+++ b/vendor/parser-standard/lexer.mll
@@ -29,8 +29,13 @@ type error =
   | Unterminated_string_in_comment of Location.t * Location.t
   | Empty_character_literal
   | Keyword_as_label of string
+  | Capitalized_label of string
   | Invalid_literal of string
   | Invalid_directive of string * string option
+  | Invalid_encoding of string
+  | Invalid_char_in_ident of Uchar.t
+  | Non_lowercase_delimiter of string
+  | Capitalized_raw_identifier of string
 
 exception Error of error * Location.t
 
@@ -255,10 +260,46 @@ let uchar_for_uchar_escape lexbuf =
       illegal_escape lexbuf
         (Printf.sprintf "%X is not a Unicode scalar value" cp)
 
+let validate_encoding lexbuf raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> error lexbuf (Invalid_encoding raw_name)
+  | Ok name -> name
+
+let ident_for_extended lexbuf raw_name =
+  let name = validate_encoding lexbuf raw_name in
+  match Utf8_lexeme.validate_identifier name with
+  | Utf8_lexeme.Valid -> name
+  | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
+  | Utf8_lexeme.Invalid_beginning _ ->
+  assert false (* excluded by the regexps *)
+
+let validate_delim lexbuf raw_name =
+  let name = validate_encoding lexbuf raw_name in
+  if Utf8_lexeme.is_lowercase name then name
+  else error lexbuf (Non_lowercase_delimiter name)
+
+let validate_ext lexbuf name =
+    let name = validate_encoding lexbuf name in
+    match Utf8_lexeme.validate_identifier ~with_dot:true name with
+    | Utf8_lexeme.Valid -> name
+    | Utf8_lexeme.Invalid_character u -> error lexbuf (Invalid_char_in_ident u)
+    | Utf8_lexeme.Invalid_beginning _ ->
+    assert false (* excluded by the regexps *)
+
+let lax_delim raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> None
+  | Ok name ->
+     if Utf8_lexeme.is_lowercase name then Some name
+     else None
+
 let is_keyword name = Hashtbl.mem keyword_table name
 
-let check_label_name lexbuf name =
-  if is_keyword name then error lexbuf (Keyword_as_label name)
+let check_label_name ?(raw_escape=false) lexbuf name =
+  if Utf8_lexeme.is_capitalized name then
+    error lexbuf (Capitalized_label name);
+  if not raw_escape && is_keyword name then
+    error lexbuf (Keyword_as_label name)
 
 (* To "unlex" a few characters *)
 let set_lexeme_length buf n = (
@@ -309,13 +350,6 @@ let preprocessor = ref None
 
 let escaped_newlines = ref false
 
-(* Warn about Latin-1 characters used in idents *)
-
-let warn_latin1 lexbuf =
-  Location.deprecated
-    (Location.curr lexbuf)
-    "ISO-Latin1 characters in identifiers"
-
 let handle_docstrings = ref true
 let comment_list = ref []
 
@@ -366,6 +400,10 @@ let prepare_error loc = function
   | Keyword_as_label kwd ->
       Location.errorf ~loc
         "%a is a keyword, it cannot be used as label name" Style.inline_code kwd
+  | Capitalized_label lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as label name, \
+         it must start with a lowercase letter" Style.inline_code lbl
   | Invalid_literal s ->
       Location.errorf ~loc "Invalid literal %s" s
   | Invalid_directive (dir, explanation) ->
@@ -373,6 +411,20 @@ let prepare_error loc = function
         (fun ppf -> match explanation with
            | None -> ()
            | Some expl -> fprintf ppf ": %s" expl)
+  | Invalid_encoding s ->
+    Location.errorf ~loc "Invalid encoding of identifier %s." s
+  | Invalid_char_in_ident u ->
+      Location.errorf ~loc "Invalid character U+%X in identifier"
+         (Uchar.to_int u)
+  | Capitalized_raw_identifier lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as a raw identifier, \
+         it must start with a lowercase letter" Style.inline_code lbl
+  | Non_lowercase_delimiter name ->
+      Location.errorf ~loc
+        "%a cannot be used as a quoted string delimiter,@ \
+         it must contain only lowercase letters."
+         Style.inline_code name
 
 let () =
   Location.register_error_of_exn
@@ -389,12 +441,11 @@ let newline = ('\013'* '\010')
 let blank = [' ' '\009' '\012']
 let lowercase = ['a'-'z' '_']
 let uppercase = ['A'-'Z']
+let identstart = lowercase | uppercase
 let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
-let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
-let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222']
-let identchar_latin1 =
-  ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
-(* This should be kept in sync with the [is_identchar] function in [env.ml] *)
+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
+let identstart_ext = identstart | utf8
+let identchar_ext = identchar | utf8
 
 let symbolchar =
   ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
@@ -406,7 +457,8 @@ let kwdopchar =
   ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|']
 
 let ident = (lowercase | uppercase) identchar*
-let extattrident = ident ('.' ident)*
+let ident_ext = identstart_ext  identchar_ext*
+let extattrident = ident_ext ('.' ident_ext)*
 
 let decimal_literal =
   ['0'-'9'] ['0'-'9' '_']*
@@ -449,35 +501,39 @@ rule token = parse
   | ".~"
       { error lexbuf
           (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) }
-  | "~" raw_ident_escape (lowercase identchar * as name) ':'
-      { LABEL name }
-  | "~" (lowercase identchar * as name) ':'
+  | "~" (identstart identchar * as name) ':'
       { check_label_name lexbuf name;
         LABEL name }
-  | "~" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf;
+  | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { let name = ident_for_extended lexbuf raw_name in
+        check_label_name ~raw_escape:(escape<>"") lexbuf name;
         LABEL name }
   | "?"
       { QUESTION }
-  | "?" raw_ident_escape (lowercase identchar * as name) ':'
-      { OPTLABEL name }
   | "?" (lowercase identchar * as name) ':'
       { check_label_name lexbuf name;
         OPTLABEL name }
-  | "?" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf;
-        OPTLABEL name }
-  | raw_ident_escape (lowercase identchar * as name)
-      { LIDENT name }
+  | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { let name = ident_for_extended lexbuf raw_name in
+        check_label_name ~raw_escape:(escape<>"") lexbuf name;
+        OPTLABEL name
+      }
   | lowercase identchar * as name
       { try Hashtbl.find keyword_table name
         with Not_found -> LIDENT name }
-  | lowercase_latin1 identchar_latin1 * as name
-      { warn_latin1 lexbuf; LIDENT name }
   | uppercase identchar * as name
       { UIDENT name } (* No capitalized keywords *)
-  | uppercase_latin1 identchar_latin1 * as name
-      { warn_latin1 lexbuf; UIDENT name }
+  | (raw_ident_escape? as escape) (ident_ext as raw_name)
+      { let name = ident_for_extended lexbuf raw_name in
+        if Utf8_lexeme.is_capitalized name then begin
+            if escape="" then UIDENT name
+            else
+              (* we don't have capitalized keywords, and thus no needs for
+                 capitalized raw identifiers. *)
+              error lexbuf (Capitalized_raw_identifier name)
+        end else
+          LIDENT name
+      } (* No non-ascii keywords *)
   | int_literal as lit { INT (lit, None) }
   | (int_literal as lit) (literal_modifier as modif)
       { INT (lit, Some modif) }
@@ -490,26 +546,34 @@ rule token = parse
   | "\""
       { let s, loc = wrap_string_lexer string lexbuf in
         STRING (s, loc, None) }
-  | "{" (lowercase* as delim) "|"
-      { let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
-        STRING (s, loc, Some delim) }
-  | "{%" (extattrident as id) "|"
+  | "{" (ident_ext? as raw_name) '|'
+      { let delim = validate_delim lexbuf raw_name in
+        let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
+        STRING (s, loc, Some delim)
+       }
+  | "{%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
         let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
         QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") }
-  | "{%" (extattrident as id) blank+ (lowercase* as delim) "|"
+  | "{%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
+        let delim = validate_delim lexbuf raw_delim in
         let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
         QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) }
-  | "{%%" (extattrident as id) "|"
+  | "{%%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
         let s, loc = wrap_string_lexer (quoted_string "") lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
         QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") }
-  | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|"
+  | "{%%" (extattrident as raw_id) blank+ (ident_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
+        let id = validate_ext lexbuf raw_id in
+        let delim = validate_delim lexbuf raw_delim in
         let s, loc = wrap_string_lexer (quoted_string delim) lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
         QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) }
@@ -689,8 +753,10 @@ and comment = parse
         is_in_string := false;
         store_string_char '\"';
         comment lexbuf }
-  | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|"
-      {
+  | "{" ('%' '%'? extattrident blank*)? (ident_ext? as raw_delim) "|"
+      { match lax_delim raw_delim with
+        | None -> store_lexeme lexbuf; comment lexbuf
+        | Some delim ->
         string_start_loc := Location.curr lexbuf;
         store_lexeme lexbuf;
         is_in_string := true;
@@ -804,8 +870,9 @@ and quoted_string delim = parse
   | eof
       { is_in_string := false;
         error_loc !string_start_loc Unterminated_string }
-  | "|" (lowercase* as edelim) "}"
+  | "|" (ident_ext? as raw_edelim) "}"
       {
+        let edelim = validate_encoding lexbuf raw_edelim in
         if delim = edelim then lexbuf.lex_start_p
         else (store_lexeme lexbuf; quoted_string delim lexbuf)
       }