[Hover] Follow LSP and treat positions as UTF16 code units

ejgallego · Nov 24, 2023 · feda594 · feda594
1 parent 9a96682
commit feda594
Show file tree

Hide file tree

Showing 8 changed files with 148 additions and 25 deletions.
diff --git a/controller/rq_completion.ml b/controller/rq_completion.ml
@@ -78,9 +78,11 @@ let validate_line ~(doc : Fleche.Doc.t) ~line =
 (* This returns a byte-based char offset for the line *)
 let validate_position ~doc ~point =
   let line, char = point in
-  Option.bind (validate_line ~doc ~line) (fun line ->
+  Option.map
+    (fun line ->
       let char = Coq.Utf8.get_byte_offset_from_utf16_pos line char in
-      Option.bind char (fun index -> Some (String.get line index)))
+      String.get line char)
+    (validate_line ~doc ~line)
 
 let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
   let line, char = point in

diff --git a/controller/rq_hover.ml b/controller/rq_hover.ml
@@ -230,8 +230,10 @@ end
 let () = List.iter Register.add [ Loc_info.h; Stats.h; Type.h; Notation.h ]
 
 let hover ~doc ~point =
+  let point = lsp_point_to_doc_point ~doc point in
   let node = Info.LC.node ~doc ~point Exact in
   let range = Option.map Doc.Node.range node in
+  let range = Option.map (doc_range_to_lsp_range ~doc) range in
   let hovers = Register.fire ~doc ~point ~node in
   match hovers with
   | [] -> `Null

diff --git a/coq/dune b/coq/dune
@@ -4,5 +4,6 @@
  ; Unfortunate we have to link the STM due to the LTAC plugin
  ; depending on it, we should fix this upstream
  (inline_tests)
- (preprocess (pps ppx_inline_test))
+ (preprocess
+  (pps ppx_inline_test))
  (libraries lang coq-core.vernac coq-serapi.serlib))
diff --git a/coq/utf8.ml b/coq/utf8.ml
@@ -179,18 +179,54 @@ let string_get_utf_8_uchar s i =
 let get_byte_offset_from_utf16_pos line (char : int) =
   let byte_idx = ref 0 in
   let utf16_char_count = ref 0 in
-  while !byte_idx < String.length line && !utf16_char_count < char do
-    let ch = string_get_utf_8_uchar line !byte_idx in
-    byte_idx := next line !byte_idx;
-    let code_unit_count =
-      uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
-    in
-    utf16_char_count := !utf16_char_count + code_unit_count;
-    ()
+  let len = String.length line in
+  (try
+     while !utf16_char_count < char do
+       let ch = string_get_utf_8_uchar line !byte_idx in
+       let next_idx = next line !byte_idx in
+       if next_idx >= len then raise Not_found else byte_idx := next_idx;
+       let code_unit_count =
+         uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
+       in
+       utf16_char_count := !utf16_char_count + code_unit_count;
+       ()
+     done
+   with _ -> ());
+  !byte_idx
+
+let get_unicode_offset_from_utf16_pos line (char : int) =
+  let byte_idx = ref 0 in
+  let count = ref 0 in
+  let utf16_char_count = ref 0 in
+  let len = String.length line in
+  (try
+     while !utf16_char_count < char do
+       let ch = string_get_utf_8_uchar line !byte_idx in
+       let next_idx = next line !byte_idx in
+       if next_idx >= len then raise Not_found else byte_idx := next_idx;
+       let code_unit_count =
+         uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
+       in
+       utf16_char_count := !utf16_char_count + code_unit_count;
+       count := !count + 1;
+       ()
+     done
+   with _ -> ());
+  !count
+
+let get_utf16_offset_from_unicode_offset line (char : int) =
+  let offset16 = ref 0 in
+  let idx = ref 0 in
+  for _ = 0 to char - 1 do
+    let ch = string_get_utf_8_uchar line !idx in
+    let byte_len = uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) in
+    offset16 := !offset16 + (byte_len / 2);
+    idx := next line !idx
   done;
-  if !byte_idx < String.length line then Some !byte_idx else None
+  !offset16
 
-let%test_unit "utf16 offsets" =
+let%test_unit "utf16 byte offsets" =
+  let check_last s i = i < String.length s && next s i == String.length s in
   let testcases_x =
     [ ("aax", 2, true)
     ; ("  xoo", 2, true)
@@ -205,15 +241,52 @@ let%test_unit "utf16 offsets" =
   List.iter
     (fun (s, i, b) ->
       let res = get_byte_offset_from_utf16_pos s i in
-      if b then
-        let res = Option.map (fun i -> s.[i]) res in
-        match res with
-        | Some x when x = 'x' -> ()
-        | Some x ->
+      if b then (
+        let res = s.[res] in
+        if res != 'x' then
           failwith
-            (Printf.sprintf "Didn't find x in test %s : %d, instead: %c" s i x)
-        | None ->
-          failwith (Printf.sprintf "Didn't not find x in test %s : %d" s i)
-      else if res != None then
-        failwith (Printf.sprintf "Shouldn't find x in test %s : %d" s i))
+            (Printf.sprintf "Didn't find x in test %s : %d, instead: %c" s i res))
+      else if not (check_last s res) then
+        failwith
+          (Printf.sprintf "Shouldn't find x in test %s / %d got %d" s i res))
     testcases_x
+
+let%test_unit "utf16 unicode offsets" =
+  let testcases =
+    [ ("aax", 2, 2)
+    ; ("  xoo", 2, 2)
+    ; ("0123", 4, 3)
+    ; ("  𝒞x", 4, 3)
+    ; ("  𝒞x𝒞", 4, 3)
+    ; ("  𝒞∫x", 5, 4)
+    ; ("  𝒞", 4, 2)
+    ; ("∫x.dy", 1, 1)
+    ]
+  in
+  List.iter
+    (fun (s, i, e) ->
+      let res = get_unicode_offset_from_utf16_pos s i in
+      if res != e then
+        failwith
+          (Printf.sprintf "Wrong result: got %d expected %d in test %s" res e s))
+    testcases
+
+let%test_unit "unicode utf16 offsets" =
+  let testcases =
+    [ ("aax", 2, 2)
+    ; ("  xoo", 2, 2)
+    ; ("0123", 3, 3)
+    ; ("  𝒞x", 3, 4)
+    ; ("  𝒞x𝒞", 3, 4)
+    ; ("  𝒞∫x", 4, 5)
+    ; ("  𝒞", 2, 2)
+    ; ("∫x.dy", 1, 1)
+    ]
+  in
+  List.iter
+    (fun (s, i, e) ->
+      let res = get_utf16_offset_from_unicode_offset s i in
+      if res != e then
+        failwith
+          (Printf.sprintf "Wrong result: got %d expected %d in test %s" res e s))
+    testcases
diff --git a/coq/utf8.mli b/coq/utf8.mli
@@ -48,5 +48,15 @@ val index_of_char : line:string -> char:char -> index option
 val length : string -> char
 
 (** Get the byte potition of a code point indexed in UTF-16 code units in a
-    UTF-8 encoded string. *)
-val get_byte_offset_from_utf16_pos : string -> int -> int option
+    UTF-8 encoded string. Returns the position of the last character if the
+    UTF-16 position was out of bounds. *)
+val get_byte_offset_from_utf16_pos : string -> int -> int
+
+(** Get the unicode potition of a code point indexed in UTF-16 code units in a
+    utf-8 encoded string. Returns the position of the last character if the
+    utf-16 position was out of bounds. *)
+val get_unicode_offset_from_utf16_pos : string -> int -> int
+
+(** Get the utf16 potition of a code point indexed in unicode code points in a
+    UTF-8 encoded string. The position must be in bounds. *)
+val get_utf16_offset_from_unicode_offset : string -> int -> int
diff --git a/lsp/core.ml b/lsp/core.ml
@@ -160,3 +160,26 @@ module DocumentDiagnosticReportPartialResult = struct
     }
   [@@deriving to_yojson]
 end
+
+
+(* Utility functions to translate between coordinate systems *)
+
+let lsp_point_to_doc_point ~(doc : Fleche.Doc.t) point =
+  let line, char = point in
+  let line_count = Array.length doc.contents.lines in
+  (* lines cannot be empty *)
+  let line = min (line_count - 1) line in
+  let s = Array.get doc.contents.lines line in
+  let char = Coq.Utf8.get_unicode_offset_from_utf16_pos s char in
+  (line, char)
+
+let doc_point_to_lsp_point ~(doc : Fleche.Doc.t) (point : Lang.Point.t) =
+  let s = Array.get doc.contents.lines point.line in
+  let char = Coq.Utf8.get_utf16_offset_from_unicode_offset s point.character in
+  { point with character = char }
+
+let doc_range_to_lsp_range ~(doc : Fleche.Doc.t) (range : Lang.Range.t) :
+    Lang.Range.t =
+  { start = doc_point_to_lsp_point ~doc range.start
+  ; end_ = doc_point_to_lsp_point ~doc range.end_
+  }
diff --git a/lsp/core.mli b/lsp/core.mli
@@ -163,3 +163,14 @@ module DocumentDiagnosticReportPartialResult : sig
     }
   [@@deriving to_yojson]
 end
+
+(** Translate an LSP position into a Fleche position, in Unicode code points. If
+    the LSP positon is out of bounds, the result is capped to the last
+    line/character. *)
+val lsp_point_to_doc_point : doc:Fleche.Doc.t -> int * int -> int * int
+
+(** Translate a Fleche position into an UTF-16 LSP position. *)
+val doc_point_to_lsp_point : doc:Fleche.Doc.t -> Lang.Point.t -> Lang.Point.t
+
+(** Translate a Fleche range into an UTF-16 LSP range. *)
+val doc_range_to_lsp_range : doc:Fleche.Doc.t -> Lang.Range.t -> Lang.Range.t
diff --git a/lsp/jLang.mli b/lsp/jLang.mli
@@ -25,3 +25,4 @@ end
 
 val mk_diagnostics :
   uri:Lang.LUri.File.t -> version:int -> Lang.Diagnostic.t list -> Yojson.Safe.t
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -25,3 +25,4 @@ end

		val mk_diagnostics :
		uri:Lang.LUri.File.t -> version:int -> Lang.Diagnostic.t list -> Yojson.Safe.t