Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Follow LSP and treat positions as UTF16 code units #613

Merged
merged 1 commit into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
- name: 🐫🐪🐫 Get dependencies
run: opam exec -- make opam-deps

- name: 🐛 Test coq-lsp
- name: 🐛 Special Windows Config [only on Win CI]
if: matrix.os == 'windows-latest'
run: opam exec -- make winconfig

Expand Down
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
@ejgallego, cc: #614)
- error recovery: Recognize `Defined` and `Admitted` in lex recovery
(@ejgallego, #616)
- completion: correctly understand UTF-16 code points on completion
request (Léo Stefanesco, #613, fixes #531)

# coq-lsp 0.1.8: Trick-or-treat
-------------------------------
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ opam-switch:
# Install opam deps
.PHONY: opam-deps
opam-deps:
opam install ./coq-lsp.opam -y --deps-only
opam install ./coq-lsp.opam -y --deps-only --with-test

# Install opam deps
.PHONY: opam-dev-deps
Expand Down
4 changes: 2 additions & 2 deletions controller/rq_completion.ml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ let validate_line ~(doc : Fleche.Doc.t) ~line =
let validate_position ~doc ~point =
let line, char = point in
Option.bind (validate_line ~doc ~line) (fun line ->
Option.bind (Coq.Utf8.index_of_char ~line ~char) (fun index ->
Some (String.get line index)))
let char = Coq.Utf8.get_byte_offset_from_utf16_pos line char in
Option.bind char (fun index -> Some (String.get line index)))

let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
let line, char = point in
Expand Down
3 changes: 3 additions & 0 deletions coq-lsp.opam
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ depends: [
# waterproof parser
"menhir" { >= "20220210" }

# for unit testing
"alcotest" { >= "1.6.0" & with-test }

# Uncomment this for releases
# "coq" { >= "8.17" < "8.18" }
# "coq-serapi" { >= "8.17" < "8.18" }
Expand Down
120 changes: 120 additions & 0 deletions coq/utf8.ml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,123 @@ let char_of_index ~line ~byte =
(* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
char)); *)
char

(* We disabled auto-formatting in copied code *)
[@@@ocamlformat "disable=true"]

(* The following is copied from Ocaml's standard library Bytes and Uchar
modules. We use the public safe variant of various functions, so it should be
slower.

TODO: when our minimum supported Ocaml version is >= 4.14 we shoud switch to
the standard library. *)

(* From Uchar.ml *)
let rep = 0xFFFD
let decode_bits = 24
let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep
let[@inline] uchar_utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)

let uchar_utf_16_byte_length u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF -> 2
| u when u <= 0x10FFFF -> 4
| _ -> assert false

(* From bytes.ml *)
let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8

let[@inline] utf_8_uchar_2 b0 b1 =
((b0 land 0x1F) lsl 6) lor
((b1 land 0x3F))

let[@inline] utf_8_uchar_3 b0 b1 b2 =
((b0 land 0x0F) lsl 12) lor
((b1 land 0x3F) lsl 6) lor
((b2 land 0x3F))

let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
((b0 land 0x07) lsl 18) lor
((b1 land 0x3F) lsl 12) lor
((b2 land 0x3F) lsl 6) lor
((b3 land 0x3F))

let[@inline] dec_ret n u = utf_decode n (Uchar.unsafe_of_int u)
let dec_invalid = utf_decode_invalid

let string_get_utf_8_uchar s i =
let b = Bytes.unsafe_of_string s in
let b0 = Bytes.get_uint8 b i in (* raises if [i] is not a valid index. *)
let get = Bytes.get_uint8 in
let max = Bytes.length b - 1 in
match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
| '\x00' .. '\x7F' -> dec_ret 1 b0
| '\xC2' .. '\xDF' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
dec_ret 2 (utf_8_uchar_2 b0 b1)
| '\xE0' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xED' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xF0' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| '\xF1' .. '\xF3' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| '\xF4' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| _ -> dec_invalid 1

(* End of copy from Stdlib *)
[@@@ocamlformat "disable=false"]

let get_byte_offset_from_utf16_pos line (char : int) =
let byte_idx = ref 0 in
let utf16_char_count = ref 0 in
while !byte_idx < String.length line && !utf16_char_count < char do
let ch = string_get_utf_8_uchar line !byte_idx in
byte_idx := next line !byte_idx;
let code_unit_count =
uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
in
utf16_char_count := !utf16_char_count + code_unit_count;
()
done;
if !byte_idx < String.length line then Some !byte_idx else None
4 changes: 4 additions & 0 deletions coq/utf8.mli
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ val index_of_char : line:string -> char:char -> index option

(** Lenght in utf-8 chars *)
val length : string -> char

(** Get the byte potition of a code point indexed in UTF-16 code units in a
UTF-8 encoded string. *)
val get_byte_offset_from_utf16_pos : string -> int -> int option
5 changes: 5 additions & 0 deletions examples/utf16.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
(* Type \ after the definition of 𝒰 , completion should properly
trigger, also hover for `nat`, and the error diagnostic should be OK
too *)

Definition 𝒰 := nat. Definition u := u.
4 changes: 2 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@

packages = l.attrValues {
inherit (config.treefmt.build) wrapper;
inherit (pkgs) dune_3 nodejs;
inherit (ocamlPackages) ocaml ocaml-lsp;
inherit (pkgs) dune_3 nodejs dune-release;
inherit (ocamlPackages) ocaml ocaml-lsp alcotest;
};
};
};
Expand Down
4 changes: 4 additions & 0 deletions test/unit_test/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(test
(name utf16_pos)
(link_flags -linkall)
(libraries alcotest coq))
30 changes: 30 additions & 0 deletions test/unit_test/utf16_pos.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
(* Build with `ocamlbuild -pkg alcotest simple.byte` *)

open Coq

let testcases_x =
[ ("aax", 2, true)
; (" xoo", 2, true)
; ("0123", 4, false)
; (" 𝒞x", 4, true)
; (" 𝒞x𝒞", 4, true)
; (" 𝒞∫x", 5, true)
; (" 𝒞", 4, false)
; ("∫x.dy", 1, true)
]

(* The tests *)
let test_x () =
List.iter
(fun (s, i, b) ->
let res = Utf8.get_byte_offset_from_utf16_pos s i in
if b then
let res = Option.map (fun i -> s.[i]) res in
Alcotest.(check (option char)) "x present" (Some 'x') res
else Alcotest.(check (option int)) "x present" None res)
testcases_x

(* Run it *)
let () =
let open Alcotest in
run "Controller" [ ("utf16", [ test_case "Find the x" `Quick test_x ]) ]
Loading