From 73a034edc34fdfe536b03faaeaaa430e440d65d4 Mon Sep 17 00:00:00 2001 From: Emilio Jesus Gallego Arias Date: Tue, 21 May 2024 23:17:43 +0200 Subject: [PATCH] [fleche] make encoding for `Lang` locations to follow protocol-level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following discussion on #624, we now store the Flèche locations on protocol-level format instead of in unicode character points. This avoids conversions on all protocol call, conversion back to UTF-8 based encoding is sometimes needed as to manipulate `Contents.t`. --- CHANGES.md | 2 ++ CONTRIBUTING.md | 23 +++++++++++++++++++ controller/rq_common.ml | 34 +++++++++++++++++++++++---- controller/rq_common.mli | 6 +++++ controller/rq_completion.ml | 28 +++------------------- controller/rq_hover.ml | 6 ----- coq/utils.ml | 14 +++++++---- fleche/contents.ml | 2 +- fleche/doc.ml | 4 +--- lang/utf.ml | 19 ++++++--------- lang/utf.mli | 46 ++++++++++++++++++++----------------- 11 files changed, 106 insertions(+), 78 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 5410dd87b..8428a44eb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -143,6 +143,8 @@ (@ejgallgo, #719) - Center the view if cursor goes out of scope in `sentenceNext/sentencePrevious` (@ejgallego, #718) + - Switch Flèche range encoding to protocol native, this means UTF-16 + for now (Léo Stefanesco, @ejgallego, #624, fixes #620, #621) # coq-lsp 0.1.8.1: Spring fix ----------------------------- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a2b823df1..b4abacf53 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -218,6 +218,29 @@ Some tips: [ocamlformat]: https://github.com/ocaml-ppx/ocamlformat +### Unicode setup + +Flèche stores locations in the protocol-native format. This has the +advantage that conversion is needed only at range creation point +(where we usually have access to the document to perform the +conversion). + +This way, we can send ranges to the client without conversion. + +Request that work on the raw `Contents.t` buffer must do the inverse +conversion, but we handle this via a proper text API that is aware of +the conversion. + +For now, the setup for Coq is: + +- protocol-level (and Flèche) encoding: UTF-16 (due to LSP) +- `Contents.t`: UTF-8 (sent to Coq) + +It would be very easy to set this parameters at initialization time, +ideal would be for LSP clients to catch up and allow UTF-8 encodings +(so no conversion is needed, at least for Coq), but it seems that we +will take a while to get to this point. + ## Client guide (VS Code Extension) The VS Code extension is setup as a standard `npm` Typescript + React package diff --git a/controller/rq_common.ml b/controller/rq_common.ml index b158309f1..2a1627afe 100644 --- a/controller/rq_common.ml +++ b/controller/rq_common.ml @@ -44,9 +44,33 @@ let get_id_at_point ~contents ~point = let { Fleche.Contents.lines; _ } = contents in if line <= Array.length lines then let line = Array.get lines line in - (* XXX UTF this will fail on unicode chars that differ among UTF-8/16 (cc - #531) *) - match Lang.Utf.utf8_offset_of_char ~line ~char:character with - | None -> None - | Some character -> find_id line character + let character = + Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:character + in + find_id line character else None + +let validate_line ~(contents : Fleche.Contents.t) ~line = + if Array.length contents.lines > line then + Some (Array.get contents.lines line) + else None + +let validate_column char line = + let length = Lang.Utf.length_utf16 line in + if char < length then + let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in + Some (String.get line char) + else None + +(* This returns a byte-based char offset for the line *) +let validate_position ~contents ~point = + let line, char = point in + validate_line ~contents ~line |> fun l -> Option.bind l (validate_column char) + +let get_char_at_point ~contents ~point = + let line, char = point in + if char >= 1 then + let point = (line, char - 1) in + validate_position ~contents ~point + else (* Can't get previous char *) + None diff --git a/controller/rq_common.mli b/controller/rq_common.mli index 80ab0dde3..a7f7bf68a 100644 --- a/controller/rq_common.mli +++ b/controller/rq_common.mli @@ -5,5 +5,11 @@ (* Written by: Emilio J. Gallego Arias *) (************************************************************************) +(* Contents utils, should be moved to Contents.t , they mainly handle character + enconding conversiong between protocol and prover positions, if needed *) + val get_id_at_point : contents:Fleche.Contents.t -> point:int * int -> string option + +val get_char_at_point : + contents:Fleche.Contents.t -> point:int * int -> char option diff --git a/controller/rq_completion.ml b/controller/rq_completion.ml index ecef3b195..62a8519e4 100644 --- a/controller/rq_completion.ml +++ b/controller/rq_completion.ml @@ -70,33 +70,11 @@ let unicode_list point : Yojson.Safe.t list = (* Coq's CList.map is tail-recursive *) CList.map (mk_unicode_completion_item point) ulist -let validate_line ~(doc : Fleche.Doc.t) ~line = - if Array.length doc.contents.lines > line then - Some (Array.get doc.contents.lines line) - else None - -(* This returns a byte-based char offset for the line *) -let validate_position ~doc ~point = - let line, char = point in - Option.map - (fun line -> - let char = Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:char in - String.get line char) - (validate_line ~doc ~line) - -let get_char_at_point ~(doc : Fleche.Doc.t) ~point = - let line, char = point in - if char >= 1 then - let point = (line, char - 1) in - validate_position ~doc ~point - else (* Can't get previous char *) - None - -(* point is a utf char! *) -let completion ~token:_ ~doc ~point = +let completion ~token:_ ~(doc : Fleche.Doc.t) ~point = (* Instead of get_char_at_point we should have a CompletionContext.t, to be addressed in further completion PRs *) - (match get_char_at_point ~doc ~point with + let contents = doc.contents in + (match Rq_common.get_char_at_point ~contents ~point with | None -> let incomplete = true in let items = [] in diff --git a/controller/rq_hover.ml b/controller/rq_hover.ml index 34078cb1e..c69849ad7 100644 --- a/controller/rq_hover.ml +++ b/controller/rq_hover.ml @@ -245,12 +245,6 @@ let () = List.iter Register.add [ Loc_info.h; Stats.h; Type.h; Notation.h ] let hover ~token ~(doc : Fleche.Doc.t) ~point = let node = Info.LC.node ~doc ~point Exact in let range = Option.map Doc.Node.range node in - (* XXX: EJGA, ranges should be already converted, why is this needed? *) - let range = - let lines = doc.contents.lines in - (* EGJA: we will improve this soon to make it safer *) - Option.map (Lang.Utf.utf16_range_of_char_range ~lines) range - in let hovers = Register.fire ~token ~doc ~point ~node in match hovers with | [] -> `Null diff --git a/coq/utils.ml b/coq/utils.ml index 407758e24..33c53b837 100644 --- a/coq/utils.ml +++ b/coq/utils.ml @@ -16,12 +16,12 @@ (* We convert in case of failure to some default values *) -let char_of_utf8_offset ~lines ~line ~byte = +let utf16_offset_of_utf8_offset ~lines ~line ~byte = if line < Array.length lines then let line = Array.get lines line in - match Lang.Utf.char_of_utf8_offset ~line ~offset:byte with + match Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte with | Some char -> char - | None -> Lang.Utf.length line + | None -> Lang.Utf.length_utf16 line else 0 let to_range ~lines (p : Loc.t) : Lang.Range.t = @@ -34,8 +34,12 @@ let to_range ~lines (p : Loc.t) : Lang.Range.t = let start_col = bp - bol_pos in let end_col = ep - bol_pos_last in - let start_col = char_of_utf8_offset ~lines ~line:start_line ~byte:start_col in - let end_col = char_of_utf8_offset ~lines ~line:end_line ~byte:end_col in + let start_col = + utf16_offset_of_utf8_offset ~lines ~line:start_line ~byte:start_col + in + let end_col = + utf16_offset_of_utf8_offset ~lines ~line:end_line ~byte:end_col + in Lang.Range. { start = { line = start_line; character = start_col; offset = bp } ; end_ = { line = end_line; character = end_col; offset = ep } diff --git a/fleche/contents.ml b/fleche/contents.ml index 299c259a4..f43687ac2 100644 --- a/fleche/contents.ml +++ b/fleche/contents.ml @@ -142,7 +142,7 @@ let get_last_text text = let lines = CString.split_on_char '\n' text |> Array.of_list in let n_lines = Array.length lines in let last_line = if n_lines < 1 then "" else Array.get lines (n_lines - 1) in - let character = Lang.Utf.length last_line in + let character = Lang.Utf.length_utf16 last_line in (Lang.Point.{ line = n_lines - 1; character; offset }, lines) let make ~uri ~raw = diff --git a/fleche/doc.ml b/fleche/doc.ml index 638db4ae8..8744f260d 100644 --- a/fleche/doc.ml +++ b/fleche/doc.ml @@ -1020,9 +1020,7 @@ let loc_after ~lines ~uri (r : Lang.Range.t) = let end_index = let line = Array.get lines r.end_.line in debug_loc_after line r; - match Lang.Utf.utf8_offset_of_char ~line ~char:r.end_.character with - | None -> String.length line - | Some index -> index + Lang.Utf.utf8_offset_of_utf16_offset ~line ~offset:r.end_.character in let bol_pos_last = r.end_.offset - end_index in { Loc.fname = init_fname ~uri diff --git a/lang/utf.ml b/lang/utf.ml index 034fe2f60..d873a48bc 100644 --- a/lang/utf.ml +++ b/lang/utf.ml @@ -291,6 +291,10 @@ let utf16_offset_of_char ~line ~(char : char) = done; !offset16 +let utf16_offset_of_utf8_offset ~line ~offset = + let char = char_of_utf8_offset ~line ~offset in + Option.map (fun char -> utf16_offset_of_char ~line ~char) char + let%test_unit "utf16 byte offsets" = let check_last s i = i < String.length s && next s i == String.length s in let testcases_x = @@ -358,15 +362,6 @@ let%test_unit "unicode utf16 offsets" = line)) testcases -let utf16_point_of_char_point ~(lines : utf8_string array) (point : Point.t) = - (* EJGA: We will make this safer soon, so not a big effort right now to - prevent invalid lines *) - let line = Array.get lines point.line in - let char = utf16_offset_of_char ~line ~char:point.character in - { point with character = char } - -let utf16_range_of_char_range ~(lines : utf8_string array) (range : Range.t) : - Range.t = - { start = utf16_point_of_char_point ~lines range.start - ; end_ = utf16_point_of_char_point ~lines range.end_ - } +let length_utf16 line = + let length = length line in + if length > 0 then utf16_offset_of_char ~line ~char:(length - 1) + 1 else 0 diff --git a/lang/utf.mli b/lang/utf.mli index db1039914..9588697cb 100644 --- a/lang/utf.mli +++ b/lang/utf.mli @@ -37,21 +37,9 @@ type char = int type utf8_index = int type utf16_index = int -(** To unicode chars *) - -(** Byte index to character position [also called a codepoint], line is enconded - in UTF-8 *) -val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option - -(** Get the unicode position of a code point indexed in UTF-16 code units in a - utf-8 encoded utf8_string. Returns the position of the last character if the - utf-16 position was out of bounds. *) -val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char - -(** To UTF-8 offsets *) - -(** UTF-8 Char to byte index position; line is enconded in UTF-8 *) -val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option +(** UTF-16 offset from UTF-8 offset; line is enconded in UTF-8 *) +val utf16_offset_of_utf8_offset : + line:utf8_string -> offset:utf8_index -> utf16_index option (** Get the byte position of a code point indexed in UTF-16 code units in a UTF-8 encoded utf8_string. Returns the position of the last character if the @@ -61,15 +49,31 @@ val utf8_offset_of_utf16_offset : (** To UTF-16 offsets *) +(** Length in UTF-16 code points *) +val length_utf16 : utf8_string -> utf16_index + +(******************************************************) +(** Not used anywhere, remove? *) + +(** Number of characters in the utf-8-encoded utf8_string. *) +val length : utf8_string -> char + +(** Converstion from char to UTF-8/16 *) + +(** UTF-8 Char to byte index position; line is enconded in UTF-8 *) +val utf8_offset_of_char : line:utf8_string -> char:char -> utf8_index option + (** Get the utf16 position of a code point indexed in unicode code points in a UTF-8 encoded utf8_string. The position must be in bounds. *) val utf16_offset_of_char : line:utf8_string -> char:int -> utf16_index -(** Number of characters in th utf-8-encoded utf8_string *) -val length : utf8_string -> char +(** Converstion to char from UTF-8/16 *) -(** Translate a Fleche position into an UTF-16 LSP position. *) -val utf16_point_of_char_point : lines:utf8_string array -> Point.t -> Point.t +(** Byte index to character position [also called a codepoint], line is encoded + in UTF-8 *) +val char_of_utf8_offset : line:utf8_string -> offset:utf8_index -> char option -(** Translate a Fleche range into an UTF-16 LSP range. *) -val utf16_range_of_char_range : lines:utf8_string array -> Range.t -> Range.t +(** Get the unicode position of a code point indexed in UTF-16 code units in a + utf-8 encoded utf8_string. Returns the position of the last character if the + utf-16 position was out of bounds. *) +val char_of_utf16_offset : line:utf8_string -> offset:utf16_index -> char