[completion] Follow LSP and treat positions as UTF16 code units

ejgallego · Nov 24, 2023 · 8e1824f · 8e1824f
1 parent 8e8fc5a
commit 8e1824f
Show file tree

Hide file tree

Showing 11 changed files with 174 additions and 6 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -61,7 +61,7 @@ jobs:
       - name: 🐫🐪🐫 Get dependencies
         run: opam exec -- make opam-deps
 
-      - name: 🐛 Test coq-lsp
+      - name: 🐛 Special Windows Config [only on Win CI]
         if: matrix.os == 'windows-latest'
         run: opam exec -- make winconfig
 

diff --git a/CHANGES.md b/CHANGES.md
@@ -36,6 +36,8 @@
    @ejgallego, cc: #614)
  - error recovery: Recognize `Defined` and `Admitted` in lex recovery
    (@ejgallego, #616)
+ - completion: correctly understand UTF-16 code points on completion
+   request (Léo Stefanesco, #613, fixes #531)
 
 # coq-lsp 0.1.8: Trick-or-treat
 -------------------------------

diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@ opam-switch:
 # Install opam deps
 .PHONY: opam-deps
 opam-deps:
-	opam install ./coq-lsp.opam -y --deps-only
+	opam install ./coq-lsp.opam -y --deps-only --with-test
 
 # Install opam deps
 .PHONY: opam-dev-deps

diff --git a/controller/rq_completion.ml b/controller/rq_completion.ml
@@ -79,8 +79,8 @@ let validate_line ~(doc : Fleche.Doc.t) ~line =
 let validate_position ~doc ~point =
   let line, char = point in
   Option.bind (validate_line ~doc ~line) (fun line ->
-      Option.bind (Coq.Utf8.index_of_char ~line ~char) (fun index ->
-          Some (String.get line index)))
+      let char = Coq.Utf8.get_byte_offset_from_utf16_pos line char in
+      Option.bind char (fun index -> Some (String.get line index)))
 
 let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
   let line, char = point in

diff --git a/coq-lsp.opam b/coq-lsp.opam
@@ -30,6 +30,9 @@ depends: [
   # waterproof parser
   "menhir"       { >= "20220210" }
 
+  # for unit testing
+  "alcotest"     { >= "1.6.0" & with-test }
+
   # Uncomment this for releases
   # "coq"          { >= "8.17" < "8.18"  }
   # "coq-serapi"   { >= "8.17" < "8.18"  }

diff --git a/coq/utf8.ml b/coq/utf8.ml
@@ -69,3 +69,123 @@ let char_of_index ~line ~byte =
   (* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
      char)); *)
   char
+
+(* We disabled auto-formatting in copied code *)
+[@@@ocamlformat "disable=true"]
+
+(* The following is copied from Ocaml's standard library Bytes and Uchar
+   modules. We use the public safe variant of various functions, so it should be
+   slower.
+
+   TODO: when our minimum supported Ocaml version is >= 4.14 we shoud switch to
+   the standard library. *)
+
+(* From Uchar.ml *)
+let rep = 0xFFFD
+let decode_bits = 24
+let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
+let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep
+let[@inline] uchar_utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)
+
+let uchar_utf_16_byte_length u = match Uchar.to_int u with
+| u when u < 0 -> assert false
+| u when u <= 0xFFFF -> 2
+| u when u <= 0x10FFFF -> 4
+| _ -> assert false
+
+(* From bytes.ml *)
+let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
+let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
+let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
+let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
+let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
+
+let[@inline] utf_8_uchar_2 b0 b1 =
+  ((b0 land 0x1F) lsl 6) lor
+  ((b1 land 0x3F))
+
+let[@inline] utf_8_uchar_3 b0 b1 b2 =
+  ((b0 land 0x0F) lsl 12) lor
+  ((b1 land 0x3F) lsl 6) lor
+  ((b2 land 0x3F))
+
+let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
+  ((b0 land 0x07) lsl 18) lor
+  ((b1 land 0x3F) lsl 12) lor
+  ((b2 land 0x3F) lsl 6) lor
+  ((b3 land 0x3F))
+
+let[@inline] dec_ret n u = utf_decode n (Uchar.unsafe_of_int u)
+let dec_invalid = utf_decode_invalid
+
+let string_get_utf_8_uchar s i =
+  let b = Bytes.unsafe_of_string s in
+  let b0 = Bytes.get_uint8 b i in (* raises if [i] is not a valid index. *)
+  let get = Bytes.get_uint8 in
+  let max = Bytes.length b - 1 in
+  match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
+  | '\x00' .. '\x7F' -> dec_ret 1 b0
+  | '\xC2' .. '\xDF' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+      dec_ret 2 (utf_8_uchar_2 b0 b1)
+  | '\xE0' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+  | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+  | '\xED' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
+  | '\xF0' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      let i = i + 1 in if i > max then dec_invalid 3 else
+      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+  | '\xF1' .. '\xF3' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      let i = i + 1 in if i > max then dec_invalid 3 else
+      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+  | '\xF4' ->
+      let i = i + 1 in if i > max then dec_invalid 1 else
+      let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
+      let i = i + 1 in if i > max then dec_invalid 2 else
+      let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
+      let i = i + 1 in if i > max then dec_invalid 3 else
+      let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
+      dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
+  | _ -> dec_invalid 1
+
+(* End of copy from Stdlib *)
+[@@@ocamlformat "disable=false"]
+
+let get_byte_offset_from_utf16_pos line (char : int) =
+  let byte_idx = ref 0 in
+  let utf16_char_count = ref 0 in
+  while !byte_idx < String.length line && !utf16_char_count < char do
+    let ch = string_get_utf_8_uchar line !byte_idx in
+    byte_idx := next line !byte_idx;
+    let code_unit_count =
+      uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
+    in
+    utf16_char_count := !utf16_char_count + code_unit_count;
+    ()
+  done;
+  if !byte_idx < String.length line then Some !byte_idx else None
diff --git a/coq/utf8.mli b/coq/utf8.mli
@@ -46,3 +46,7 @@ val index_of_char : line:string -> char:char -> index option
 
 (** Lenght in utf-8 chars *)
 val length : string -> char
+
+(** Get the byte potition of a code point indexed in UTF-16 code units in a
+    UTF-8 encoded string. *)
+val get_byte_offset_from_utf16_pos : string -> int -> int option
diff --git a/examples/utf16.v b/examples/utf16.v
@@ -0,0 +1,5 @@
+(* Type \ after the definition of 𝒰 , completion should properly
+trigger, also hover for `nat`, and the error diagnostic should be OK
+too *)
+
+Definition 𝒰 := nat. Definition u := u.
diff --git a/flake.nix b/flake.nix
@@ -89,8 +89,8 @@
 
           packages = l.attrValues {
             inherit (config.treefmt.build) wrapper;
-            inherit (pkgs) dune_3 nodejs;
-            inherit (ocamlPackages) ocaml ocaml-lsp;
+            inherit (pkgs) dune_3 nodejs dune-release;
+            inherit (ocamlPackages) ocaml ocaml-lsp alcotest;
           };
         };
       };

diff --git a/test/unit_test/dune b/test/unit_test/dune
@@ -0,0 +1,4 @@
+(test
+ (name utf16_pos)
+ (link_flags -linkall)
+ (libraries alcotest coq))
diff --git a/test/unit_test/utf16_pos.ml b/test/unit_test/utf16_pos.ml
@@ -0,0 +1,30 @@
+(* Build with `ocamlbuild -pkg alcotest simple.byte` *)
+
+open Coq
+
+let testcases_x =
+  [ ("aax", 2, true)
+  ; ("  xoo", 2, true)
+  ; ("0123", 4, false)
+  ; ("  𝒞x", 4, true)
+  ; ("  𝒞x𝒞", 4, true)
+  ; ("  𝒞∫x", 5, true)
+  ; ("  𝒞", 4, false)
+  ; ("∫x.dy", 1, true)
+  ]
+
+(* The tests *)
+let test_x () =
+  List.iter
+    (fun (s, i, b) ->
+      let res = Utf8.get_byte_offset_from_utf16_pos s i in
+      if b then
+        let res = Option.map (fun i -> s.[i]) res in
+        Alcotest.(check (option char)) "x present" (Some 'x') res
+      else Alcotest.(check (option int)) "x present" None res)
+    testcases_x
+
+(* Run it *)
+let () =
+  let open Alcotest in
+  run "Controller" [ ("utf16", [ test_case "Find the x" `Quick test_x ]) ]