Skip to content

Commit

Permalink
Fix unicode classification of non-spacing marks.
Browse files Browse the repository at this point in the history
This commit adds the ~2000 non-spacing marks into the IdentPart category.
This includes all the combining marks, and thus fixes coq#19512. This also
means that characters in the range 1DC0-1DFF can no longer appear at the
start of an identifier (which does not make sense anyway, as they are
combining marks).

This commit also fixes a few exceptions, which were actually no exception:
- the dot is already in Symbol,
- phonetic extensions are already in Letter.
  • Loading branch information
silene committed Oct 15, 2024
1 parent a79fb09 commit b1ba7e7
Showing 1 changed file with 2 additions and 11 deletions.
13 changes: 2 additions & 11 deletions clib/unicode.ml
Original file line number Diff line number Diff line change
Expand Up @@ -98,25 +98,16 @@ let classify =
Unicodetable.nd; (* Number, decimal digits. *)
Unicodetable.nl; (* Number, letter. *)
Unicodetable.no; (* Number, other. *)
Unicodetable.mn; (* Non-spacing marks. *)
];

(* Workaround. Some characters seems to be missing in
Camomile's category tables. We add them manually. *)
mk_lookup_table_from_unicode_tables_for Letter
[
[(0x01D00, 0x01D7F)]; (* Phonetic Extensions. *)
[(0x01D80, 0x01DBF)]; (* Phonetic Extensions Suppl. *)
[(0x01DC0, 0x01DFF)]; (* Combining Diacritical Marks Suppl.*)
];

(* Exceptions (from a previous version of this function). *)
(* Exceptions from Number, other. *)
mk_lookup_table_from_unicode_tables_for Symbol
[
[(0x000B2, 0x000B3)]; (* Superscript 2-3. *)
single 0x000B9; (* Superscript 1. *)
single 0x02070; (* Superscript 0. *)
[(0x02074, 0x02079)]; (* Superscript 4-9. *)
single 0x0002E; (* Dot. *)
];
mk_lookup_table_from_unicode_tables_for Separator
[
Expand Down

0 comments on commit b1ba7e7

Please sign in to comment.