From 73a034edc34fdfe536b03faaeaaa430e440d65d4 Mon Sep 17 00:00:00 2001
From: Emilio Jesus Gallego Arias <e+git@x80.org>
Date: Tue, 21 May 2024 23:17:43 +0200
Subject: [PATCH] [fleche] make encoding for `Lang` locations to follow
 protocol-level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Following discussion on #624, we now store the Flèche locations on
protocol-level format instead of in unicode character points.

This avoids conversions on all protocol call, conversion back to UTF-8
based encoding is sometimes needed as to manipulate `Contents.t`.
---
 CHANGES.md                  |  2 ++
 CONTRIBUTING.md             | 23 +++++++++++++++++++
 controller/rq_common.ml     | 34 +++++++++++++++++++++++----
 controller/rq_common.mli    |  6 +++++
 controller/rq_completion.ml | 28 +++-------------------
 controller/rq_hover.ml      |  6 -----
 coq/utils.ml                | 14 +++++++----
 fleche/contents.ml          |  2 +-
 fleche/doc.ml               |  4 +---
 lang/utf.ml                 | 19 ++++++---------
 lang/utf.mli                | 46 ++++++++++++++++++++-----------------
 11 files changed, 106 insertions(+), 78 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 5410dd87b..8428a44eb 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -143,6 +143,8 @@
    (@ejgallgo, #719)
  - Center the view if cursor goes out of scope in
    `sentenceNext/sentencePrevious` (@ejgallego, #718)
+ - Switch Flèche range encoding to protocol native, this means UTF-16
+   for now (Léo Stefanesco, @ejgallego, #624, fixes #620, #621)
 
 # coq-lsp 0.1.8.1: Spring fix
 -----------------------------
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a2b823df1..b4abacf53 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -218,6 +218,29 @@ Some tips:
 
 [ocamlformat]: https://github.com/ocaml-ppx/ocamlformat
 
+### Unicode setup
+
+Flèche stores locations in the protocol-native format. This has the
+advantage that conversion is needed only at range creation point
+(where we usually have access to the document to perform the
+conversion).
+
+This way, we can send ranges to the client without conversion.
+
+Request that work on the raw `Contents.t` buffer must do the inverse
+conversion, but we handle this via a proper text API that is aware of
+the conversion.
+
+For now, the setup for Coq is:
+
+- protocol-level (and Flèche) encoding: UTF-16 (due to LSP)
+- `Contents.t`: UTF-8 (sent to Coq)
+
+It would be very easy to set this parameters at initialization time,
+ideal would be for LSP clients to catch up and allow UTF-8 encodings
+(so no conversion is needed, at least for Coq), but it seems that we
+will take a while to get to this point.
+
 ## Client guide (VS Code Extension)
 
 The VS Code extension is setup as a standard `npm` Typescript + React package
diff --git a/controller/rq_common.ml b/controller/rq_common.ml
index b158309f1..2a1627afe 100644
--- a/controller/rq_common.ml
+++ b/controller/rq_common.ml
@@ -44,9 +44,33 @@ let get_id_at_point ~contents ~point =
   let { Fleche.Contents.lines; _ } = contents in
   if line <= Array.length lines then
     let line = Array.get lines line in
-    (* XXX UTF this will fail on unicode chars that differ among UTF-8/16 (cc
-       #531) *)
-    match Lang.Utf.utf8_offset_of_char ~line ~char:character with
-    | None -> None
-    | Some character -> find_id line character
+    let character =
+      Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:character
+    in
+    find_id line character
   else None
+
+let validate_line ~(contents : Fleche.Contents.t) ~line =
+  if Array.length contents.lines > line then
+    Some (Array.get contents.lines line)
+  else None
+
+let validate_column char line =
+  let length = Lang.Utf.length_utf16 line in
+  if char < length then
+    let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in
+    Some (String.get line char)
+  else None
+
+(* This returns a byte-based char offset for the line *)
+let validate_position ~contents ~point =
+  let line, char = point in
+  validate_line ~contents ~line |> fun l -> Option.bind l (validate_column char)
+
+let get_char_at_point ~contents ~point =
+  let line, char = point in
+  if char >= 1 then
+    let point = (line, char - 1) in
+    validate_position ~contents ~point
+  else (* Can't get previous char *)
+    None
diff --git a/controller/rq_common.mli b/controller/rq_common.mli
index 80ab0dde3..a7f7bf68a 100644
--- a/controller/rq_common.mli
+++ b/controller/rq_common.mli
@@ -5,5 +5,11 @@
 (* Written by: Emilio J. Gallego Arias                                  *)
 (************************************************************************)
 
+(* Contents utils, should be moved to Contents.t , they mainly handle character
+   enconding conversiong between protocol and prover positions, if needed *)
+
 val get_id_at_point :
   contents:Fleche.Contents.t -> point:int * int -> string option
+
+val get_char_at_point :
+  contents:Fleche.Contents.t -> point:int * int -> char option
diff --git a/controller/rq_completion.ml b/controller/rq_completion.ml
index ecef3b195..62a8519e4 100644
--- a/controller/rq_completion.ml
+++ b/controller/rq_completion.ml
@@ -70,33 +70,11 @@ let unicode_list point : Yojson.Safe.t list =
   (* Coq's CList.map is tail-recursive *)
   CList.map (mk_unicode_completion_item point) ulist
 
-let validate_line ~(doc : Fleche.Doc.t) ~line =
-  if Array.length doc.contents.lines > line then
-    Some (Array.get doc.contents.lines line)
-  else None
-
-(* This returns a byte-based char offset for the line *)
-let validate_position ~doc ~point =
-  let line, char = point in
-  Option.map
-    (fun line ->
-      let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in
-      String.get line char)
-    (validate_line ~doc ~line)
-
-let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
-  let line, char = point in
-  if char >= 1 then
-    let point = (line, char - 1) in
-    validate_position ~doc ~point
-  else (* Can't get previous char *)
-    None
-
-(* point is a utf char! *)
-let completion ~token:_ ~doc ~point =
+let completion ~token:_ ~(doc : Fleche.Doc.t) ~point =
   (* Instead of get_char_at_point we should have a CompletionContext.t, to be
      addressed in further completion PRs *)
-  (match get_char_at_point ~doc ~point with
+  let contents = doc.contents in
+  (match Rq_common.get_char_at_point ~contents ~point with
   | None ->
     let incomplete = true in
     let items = [] in
diff --git a/controller/rq_hover.ml b/controller/rq_hover.ml
index 34078cb1e..c69849ad7 100644
--- a/controller/rq_hover.ml
+++ b/controller/rq_hover.ml
@@ -245,12 +245,6 @@ let () = List.iter Register.add [ Loc_info.h; Stats.h; Type.h; Notation.h ]
 let hover ~token ~(doc : Fleche.Doc.t) ~point =
   let node = Info.LC.node ~doc ~point Exact in
   let range = Option.map Doc.Node.range node in
-  (* XXX: EJGA, ranges should be already converted, why is this needed? *)
-  let range =
-    let lines = doc.contents.lines in
-    (* EGJA: we will improve this soon to make it safer *)
-    Option.map (Lang.Utf.utf16_range_of_char_range ~lines) range
-  in
   let hovers = Register.fire ~token ~doc ~point ~node in
   match hovers with
   | [] -> `Null
diff --git a/coq/utils.ml b/coq/utils.ml
index 407758e24..33c53b837 100644
--- a/coq/utils.ml
+++ b/coq/utils.ml
@@ -16,12 +16,12 @@
 
 (* We convert in case of failure to some default values *)
 
-let char_of_utf8_offset ~lines ~line ~byte =
+let utf16_offset_of_utf8_offset ~lines ~line ~byte =
   if line < Array.length lines then
     let line = Array.get lines line in
-    match Lang.Utf.char_of_utf8_offset ~line ~offset:byte with
+    match Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte with
     | Some char -> char
-    | None -> Lang.Utf.length line
+    | None -> Lang.Utf.length_utf16 line
   else 0
 
 let to_range ~lines (p : Loc.t) : Lang.Range.t =
@@ -34,8 +34,12 @@ let to_range ~lines (p : Loc.t) : Lang.Range.t =
   let start_col = bp - bol_pos in
   let end_col = ep - bol_pos_last in
 
-  let start_col = char_of_utf8_offset ~lines ~line:start_line ~byte:start_col in
-  let end_col = char_of_utf8_offset ~lines ~line:end_line ~byte:end_col in
+  let start_col =
+    utf16_offset_of_utf8_offset ~lines ~line:start_line ~byte:start_col
+  in
+  let end_col =
+    utf16_offset_of_utf8_offset ~lines ~line:end_line ~byte:end_col
+  in
   Lang.Range.
     { start = { line = start_line; character = start_col; offset = bp }
     ; end_ = { line = end_line; character = end_col; offset = ep }
diff --git a/fleche/contents.ml b/fleche/contents.ml
index 299c259a4..f43687ac2 100644
--- a/fleche/contents.ml
+++ b/fleche/contents.ml
@@ -142,7 +142,7 @@ let get_last_text text =
   let lines = CString.split_on_char '\n' text |> Array.of_list in
   let n_lines = Array.length lines in
   let last_line = if n_lines < 1 then "" else Array.get lines (n_lines - 1) in
-  let character = Lang.Utf.length last_line in
+  let character = Lang.Utf.length_utf16 last_line in
   (Lang.Point.{ line = n_lines - 1; character; offset }, lines)
 
 let make ~uri ~raw =
diff --git a/fleche/doc.ml b/fleche/doc.ml
index 638db4ae8..8744f260d 100644
--- a/fleche/doc.ml
+++ b/fleche/doc.ml
@@ -1020,9 +1020,7 @@ let loc_after ~lines ~uri (r : Lang.Range.t) =
   let end_index =
     let line = Array.get lines r.end_.line in
     debug_loc_after line r;
-    match Lang.Utf.utf8_offset_of_char ~line ~char:r.end_.character with
-    | None -> String.length line
-    | Some index -> index
+    Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:r.end_.character
   in
   let bol_pos_last = r.end_.offset - end_index in
   { Loc.fname = init_fname ~uri
diff --git a/lang/utf.ml b/lang/utf.ml
index 034fe2f60..d873a48bc 100644
--- a/lang/utf.ml
+++ b/lang/utf.ml
@@ -291,6 +291,10 @@ let utf16_offset_of_char ~line ~(char : char) =
   done;
   !offset16
 
+let utf16_offset_of_utf8_offset ~line ~offset =
+  let char = char_of_utf8_offset ~line ~offset in
+  Option.map (fun char -> utf16_offset_of_char ~line ~char) char
+
 let%test_unit "utf16 byte offsets" =
   let check_last s i = i < String.length s && next s i == String.length s in
   let testcases_x =
@@ -358,15 +362,6 @@ let%test_unit "unicode utf16 offsets" =
              line))
     testcases
 
-let utf16_point_of_char_point ~(lines : utf8_string array) (point : Point.t) =
-  (* EJGA: We will make this safer soon, so not a big effort right now to
-     prevent invalid lines *)
-  let line = Array.get lines point.line in
-  let char = utf16_offset_of_char ~line ~char:point.character in
-  { point with character = char }
-
-let utf16_range_of_char_range ~(lines : utf8_string array) (range : Range.t) :
-    Range.t =
-  { start = utf16_point_of_char_point ~lines range.start
-  ; end_ = utf16_point_of_char_point ~lines range.end_
-  }
+let length_utf16 line =
+  let length = length line in
+  if length > 0 then utf16_offset_of_char ~line ~char:(length - 1) + 1 else 0
diff --git a/lang/utf.mli b/lang/utf.mli
index db1039914..9588697cb 100644
--- a/lang/utf.mli
+++ b/lang/utf.mli
@@ -37,21 +37,9 @@ type char = int
 type utf8_index = int
 type utf16_index = int
 
-(** To unicode chars *)
-
-(** Byte index to character position [also called a codepoint], line is enconded
-    in UTF-8 *)
-val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option
-
-(** Get the unicode position of a code point indexed in UTF-16 code units in a
-    utf-8 encoded utf8_string. Returns the position of the last character if the
-    utf-16 position was out of bounds. *)
-val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char
-
-(** To UTF-8 offsets *)
-
-(** UTF-8 Char to byte index position; line is enconded in UTF-8 *)
-val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option
+(** UTF-16 offset from UTF-8 offset; line is enconded in UTF-8 *)
+val utf16_offset_of_utf8_offset :
+  line:utf8_string -> offset:utf8_index -> utf16_index option
 
 (** Get the byte position of a code point indexed in UTF-16 code units in a
     UTF-8 encoded utf8_string. Returns the position of the last character if the
@@ -61,15 +49,31 @@ val utf8_offset_of_utf16_offset :
 
 (** To UTF-16 offsets *)
 
+(** Length in UTF-16 code points *)
+val length_utf16 : utf8_string -> utf16_index
+
+(******************************************************)
+(** Not used anywhere, remove? *)
+
+(** Number of characters in the utf-8-encoded utf8_string. *)
+val length : utf8_string -> char
+
+(** Converstion from char to UTF-8/16 *)
+
+(** UTF-8 Char to byte index position; line is enconded in UTF-8 *)
+val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option
+
 (** Get the utf16 position of a code point indexed in unicode code points in a
     UTF-8 encoded utf8_string. The position must be in bounds. *)
 val utf16_offset_of_char : line:utf8_string -> char:int -> utf16_index
 
-(** Number of characters in th utf-8-encoded utf8_string *)
-val length : utf8_string -> char
+(** Converstion to char from UTF-8/16 *)
 
-(** Translate a Fleche position into an UTF-16 LSP position. *)
-val utf16_point_of_char_point : lines:utf8_string array -> Point.t -> Point.t
+(** Byte index to character position [also called a codepoint], line is encoded
+    in UTF-8 *)
+val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option
 
-(** Translate a Fleche range into an UTF-16 LSP range. *)
-val utf16_range_of_char_range : lines:utf8_string array -> Range.t -> Range.t
+(** Get the unicode position of a code point indexed in UTF-16 code units in a
+    utf-8 encoded utf8_string. Returns the position of the last character if the
+    utf-16 position was out of bounds. *)
+val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char