Skip to content

Commit

Permalink
[fleche] make encoding for Lang locations to follow protocol-level
Browse files Browse the repository at this point in the history
Following discussion on #624, we now store the Flèche locations on
protocol-level format instead of in unicode character points.

This avoids conversions on all protocol call, conversion back to UTF-8
based encoding is sometimes needed as to manipulate `Contents.t`.
  • Loading branch information
ejgallego committed May 22, 2024
1 parent 3e4b126 commit 73a034e
Show file tree
Hide file tree
Showing 11 changed files with 106 additions and 78 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@
(@ejgallgo, #719)
- Center the view if cursor goes out of scope in
`sentenceNext/sentencePrevious` (@ejgallego, #718)
- Switch Flèche range encoding to protocol native, this means UTF-16
for now (Léo Stefanesco, @ejgallego, #624, fixes #620, #621)

# coq-lsp 0.1.8.1: Spring fix
-----------------------------
Expand Down
23 changes: 23 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,29 @@ Some tips:

[ocamlformat]: https://github.com/ocaml-ppx/ocamlformat

### Unicode setup

Flèche stores locations in the protocol-native format. This has the
advantage that conversion is needed only at range creation point
(where we usually have access to the document to perform the
conversion).

This way, we can send ranges to the client without conversion.

Request that work on the raw `Contents.t` buffer must do the inverse
conversion, but we handle this via a proper text API that is aware of
the conversion.

For now, the setup for Coq is:

- protocol-level (and Flèche) encoding: UTF-16 (due to LSP)
- `Contents.t`: UTF-8 (sent to Coq)

It would be very easy to set this parameters at initialization time,
ideal would be for LSP clients to catch up and allow UTF-8 encodings
(so no conversion is needed, at least for Coq), but it seems that we
will take a while to get to this point.

## Client guide (VS Code Extension)

The VS Code extension is setup as a standard `npm` Typescript + React package
Expand Down
34 changes: 29 additions & 5 deletions controller/rq_common.ml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,33 @@ let get_id_at_point ~contents ~point =
let { Fleche.Contents.lines; _ } = contents in
if line <= Array.length lines then
let line = Array.get lines line in
(* XXX UTF this will fail on unicode chars that differ among UTF-8/16 (cc
#531) *)
match Lang.Utf.utf8_offset_of_char ~line ~char:character with
| None -> None
| Some character -> find_id line character
let character =
Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:character
in
find_id line character
else None

let validate_line ~(contents : Fleche.Contents.t) ~line =
if Array.length contents.lines > line then
Some (Array.get contents.lines line)
else None

let validate_column char line =
let length = Lang.Utf.length_utf16 line in
if char < length then
let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in
Some (String.get line char)
else None

(* This returns a byte-based char offset for the line *)
let validate_position ~contents ~point =
let line, char = point in
validate_line ~contents ~line |> fun l -> Option.bind l (validate_column char)

let get_char_at_point ~contents ~point =
let line, char = point in
if char >= 1 then
let point = (line, char - 1) in
validate_position ~contents ~point
else (* Can't get previous char *)
None
6 changes: 6 additions & 0 deletions controller/rq_common.mli
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,11 @@
(* Written by: Emilio J. Gallego Arias *)
(************************************************************************)

(* Contents utils, should be moved to Contents.t , they mainly handle character
enconding conversiong between protocol and prover positions, if needed *)

val get_id_at_point :
contents:Fleche.Contents.t -> point:int * int -> string option

val get_char_at_point :
contents:Fleche.Contents.t -> point:int * int -> char option
28 changes: 3 additions & 25 deletions controller/rq_completion.ml
Original file line number Diff line number Diff line change
Expand Up @@ -70,33 +70,11 @@ let unicode_list point : Yojson.Safe.t list =
(* Coq's CList.map is tail-recursive *)
CList.map (mk_unicode_completion_item point) ulist

let validate_line ~(doc : Fleche.Doc.t) ~line =
if Array.length doc.contents.lines > line then
Some (Array.get doc.contents.lines line)
else None

(* This returns a byte-based char offset for the line *)
let validate_position ~doc ~point =
let line, char = point in
Option.map
(fun line ->
let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in
String.get line char)
(validate_line ~doc ~line)

let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
let line, char = point in
if char >= 1 then
let point = (line, char - 1) in
validate_position ~doc ~point
else (* Can't get previous char *)
None

(* point is a utf char! *)
let completion ~token:_ ~doc ~point =
let completion ~token:_ ~(doc : Fleche.Doc.t) ~point =
(* Instead of get_char_at_point we should have a CompletionContext.t, to be
addressed in further completion PRs *)
(match get_char_at_point ~doc ~point with
let contents = doc.contents in
(match Rq_common.get_char_at_point ~contents ~point with
| None ->
let incomplete = true in
let items = [] in
Expand Down
6 changes: 0 additions & 6 deletions controller/rq_hover.ml
Original file line number Diff line number Diff line change
Expand Up @@ -245,12 +245,6 @@ let () = List.iter Register.add [ Loc_info.h; Stats.h; Type.h; Notation.h ]
let hover ~token ~(doc : Fleche.Doc.t) ~point =
let node = Info.LC.node ~doc ~point Exact in
let range = Option.map Doc.Node.range node in
(* XXX: EJGA, ranges should be already converted, why is this needed? *)
let range =
let lines = doc.contents.lines in
(* EGJA: we will improve this soon to make it safer *)
Option.map (Lang.Utf.utf16_range_of_char_range ~lines) range
in
let hovers = Register.fire ~token ~doc ~point ~node in
match hovers with
| [] -> `Null
Expand Down
14 changes: 9 additions & 5 deletions coq/utils.ml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@

(* We convert in case of failure to some default values *)

let char_of_utf8_offset ~lines ~line ~byte =
let utf16_offset_of_utf8_offset ~lines ~line ~byte =
if line < Array.length lines then
let line = Array.get lines line in
match Lang.Utf.char_of_utf8_offset ~line ~offset:byte with
match Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte with
| Some char -> char
| None -> Lang.Utf.length line
| None -> Lang.Utf.length_utf16 line
else 0

let to_range ~lines (p : Loc.t) : Lang.Range.t =
Expand All @@ -34,8 +34,12 @@ let to_range ~lines (p : Loc.t) : Lang.Range.t =
let start_col = bp - bol_pos in
let end_col = ep - bol_pos_last in

let start_col = char_of_utf8_offset ~lines ~line:start_line ~byte:start_col in
let end_col = char_of_utf8_offset ~lines ~line:end_line ~byte:end_col in
let start_col =
utf16_offset_of_utf8_offset ~lines ~line:start_line ~byte:start_col
in
let end_col =
utf16_offset_of_utf8_offset ~lines ~line:end_line ~byte:end_col
in
Lang.Range.
{ start = { line = start_line; character = start_col; offset = bp }
; end_ = { line = end_line; character = end_col; offset = ep }
Expand Down
2 changes: 1 addition & 1 deletion fleche/contents.ml
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ let get_last_text text =
let lines = CString.split_on_char '\n' text |> Array.of_list in
let n_lines = Array.length lines in
let last_line = if n_lines < 1 then "" else Array.get lines (n_lines - 1) in
let character = Lang.Utf.length last_line in
let character = Lang.Utf.length_utf16 last_line in
(Lang.Point.{ line = n_lines - 1; character; offset }, lines)

let make ~uri ~raw =
Expand Down
4 changes: 1 addition & 3 deletions fleche/doc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1020,9 +1020,7 @@ let loc_after ~lines ~uri (r : Lang.Range.t) =
let end_index =
let line = Array.get lines r.end_.line in
debug_loc_after line r;
match Lang.Utf.utf8_offset_of_char ~line ~char:r.end_.character with
| None -> String.length line
| Some index -> index
Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:r.end_.character
in
let bol_pos_last = r.end_.offset - end_index in
{ Loc.fname = init_fname ~uri
Expand Down
19 changes: 7 additions & 12 deletions lang/utf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ let utf16_offset_of_char ~line ~(char : char) =
done;
!offset16

let utf16_offset_of_utf8_offset ~line ~offset =
let char = char_of_utf8_offset ~line ~offset in
Option.map (fun char -> utf16_offset_of_char ~line ~char) char

let%test_unit "utf16 byte offsets" =
let check_last s i = i < String.length s && next s i == String.length s in
let testcases_x =
Expand Down Expand Up @@ -358,15 +362,6 @@ let%test_unit "unicode utf16 offsets" =
line))
testcases

let utf16_point_of_char_point ~(lines : utf8_string array) (point : Point.t) =
(* EJGA: We will make this safer soon, so not a big effort right now to
prevent invalid lines *)
let line = Array.get lines point.line in
let char = utf16_offset_of_char ~line ~char:point.character in
{ point with character = char }

let utf16_range_of_char_range ~(lines : utf8_string array) (range : Range.t) :
Range.t =
{ start = utf16_point_of_char_point ~lines range.start
; end_ = utf16_point_of_char_point ~lines range.end_
}
let length_utf16 line =
let length = length line in
if length > 0 then utf16_offset_of_char ~line ~char:(length - 1) + 1 else 0
46 changes: 25 additions & 21 deletions lang/utf.mli
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,9 @@ type char = int
type utf8_index = int
type utf16_index = int

(** To unicode chars *)

(** Byte index to character position [also called a codepoint], line is enconded
in UTF-8 *)
val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option

(** Get the unicode position of a code point indexed in UTF-16 code units in a
utf-8 encoded utf8_string. Returns the position of the last character if the
utf-16 position was out of bounds. *)
val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char

(** To UTF-8 offsets *)

(** UTF-8 Char to byte index position; line is enconded in UTF-8 *)
val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option
(** UTF-16 offset from UTF-8 offset; line is enconded in UTF-8 *)
val utf16_offset_of_utf8_offset :
line:utf8_string -> offset:utf8_index -> utf16_index option

(** Get the byte position of a code point indexed in UTF-16 code units in a
UTF-8 encoded utf8_string. Returns the position of the last character if the
Expand All @@ -61,15 +49,31 @@ val utf8_offset_of_utf16_offset :

(** To UTF-16 offsets *)

(** Length in UTF-16 code points *)
val length_utf16 : utf8_string -> utf16_index

(******************************************************)
(** Not used anywhere, remove? *)

(** Number of characters in the utf-8-encoded utf8_string. *)
val length : utf8_string -> char

(** Converstion from char to UTF-8/16 *)

(** UTF-8 Char to byte index position; line is enconded in UTF-8 *)
val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option

(** Get the utf16 position of a code point indexed in unicode code points in a
UTF-8 encoded utf8_string. The position must be in bounds. *)
val utf16_offset_of_char : line:utf8_string -> char:int -> utf16_index

(** Number of characters in th utf-8-encoded utf8_string *)
val length : utf8_string -> char
(** Converstion to char from UTF-8/16 *)

(** Translate a Fleche position into an UTF-16 LSP position. *)
val utf16_point_of_char_point : lines:utf8_string array -> Point.t -> Point.t
(** Byte index to character position [also called a codepoint], line is encoded
in UTF-8 *)
val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option

(** Translate a Fleche range into an UTF-16 LSP range. *)
val utf16_range_of_char_range : lines:utf8_string array -> Range.t -> Range.t
(** Get the unicode position of a code point indexed in UTF-16 code units in a
utf-8 encoded utf8_string. Returns the position of the last character if the
utf-16 position was out of bounds. *)
val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char

0 comments on commit 73a034e

Please sign in to comment.