Skip to content

Commit

Permalink
[completion] Follow LSP and treat positions as UTF16 code units
Browse files Browse the repository at this point in the history
  • Loading branch information
ineol committed Nov 22, 2023
1 parent 8e8fc5a commit ccb5168
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
- name: 🐫🐪🐫 Get dependencies
run: opam exec -- make opam-deps

- name: 🐛 Test coq-lsp
- name: 🐛 Special Windows Config [only on Win CI]
if: matrix.os == 'windows-latest'
run: opam exec -- make winconfig

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ opam-switch:
# Install opam deps
.PHONY: opam-deps
opam-deps:
opam install ./coq-lsp.opam -y --deps-only
opam install ./coq-lsp.opam -y --deps-only --with-test

# Install opam deps
.PHONY: opam-dev-deps
Expand Down
4 changes: 2 additions & 2 deletions controller/rq_completion.ml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ let validate_line ~(doc : Fleche.Doc.t) ~line =
let validate_position ~doc ~point =
let line, char = point in
Option.bind (validate_line ~doc ~line) (fun line ->
Option.bind (Coq.Utf8.index_of_char ~line ~char) (fun index ->
Some (String.get line index)))
let char = Coq.Utf8.get_byte_offset_from_utf16_pos line char in
Option.bind char (fun index -> Some (String.get line index)))

let get_char_at_point ~(doc : Fleche.Doc.t) ~point =
let line, char = point in
Expand Down
3 changes: 3 additions & 0 deletions coq-lsp.opam
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ depends: [
# waterproof parser
"menhir" { >= "20220210" }

# for unit testing
"alcotest" { >= "1.7.0" & with-test }

# Uncomment this for releases
# "coq" { >= "8.17" < "8.18" }
# "coq-serapi" { >= "8.17" < "8.18" }
Expand Down
121 changes: 121 additions & 0 deletions coq/utf8.ml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,124 @@ let char_of_index ~line ~byte =
(* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
char)); *)
char

(* The following is copied from Ocaml's standard library Bytes and Uchar
modules. We use the public safe variant of various functions, so it should be
slower.
TODO: when our minimum supported Ocaml version is >= 4.14
we shoud switch to the standard library. *)

(* From Uchar.ml *)
let rep = 0xFFFD
let decode_bits = 24

let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor rep

let[@inline] uchar_utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)

let uchar_utf_16_byte_length u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF -> 2
| u when u <= 0x10FFFF -> 4
| _ -> assert false

(* From bytes.ml *)

let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8

let[@inline] utf_8_uchar_2 b0 b1 =
((b0 land 0x1F) lsl 6) lor
((b1 land 0x3F))

let[@inline] utf_8_uchar_3 b0 b1 b2 =
((b0 land 0x0F) lsl 12) lor
((b1 land 0x3F) lsl 6) lor
((b2 land 0x3F))

let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
((b0 land 0x07) lsl 18) lor
((b1 land 0x3F) lsl 12) lor
((b2 land 0x3F) lsl 6) lor
((b3 land 0x3F))

let[@inline] dec_ret n u = utf_decode n (Uchar.unsafe_of_int u)
let dec_invalid = utf_decode_invalid

let string_get_utf_8_uchar s i =
let b = Bytes.unsafe_of_string s in
let b0 = Bytes.get_uint8 b i in (* raises if [i] is not a valid index. *)
let get = Bytes.get_uint8 in
let max = Bytes.length b - 1 in
match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
| '\x00' .. '\x7F' -> dec_ret 1 b0
| '\xC2' .. '\xDF' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
dec_ret 2 (utf_8_uchar_2 b0 b1)
| '\xE0' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xED' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
| '\xF0' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| '\xF1' .. '\xF3' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| '\xF4' ->
let i = i + 1 in if i > max then dec_invalid 1 else
let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
let i = i + 1 in if i > max then dec_invalid 2 else
let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
let i = i + 1 in if i > max then dec_invalid 3 else
let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
| _ -> dec_invalid 1

(* End of copy from Stdlib *)


let get_byte_offset_from_utf16_pos line (char : int) =
let byte_idx = ref 0 in
let utf16_char_count = ref 0 in
while !byte_idx < String.length line && !utf16_char_count < char do
let ch = string_get_utf_8_uchar line !byte_idx in
byte_idx := next line !byte_idx;
let code_unit_count =
uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
in
utf16_char_count := !utf16_char_count + code_unit_count;
()
done;
if !byte_idx < String.length line then Some !byte_idx else None

4 changes: 4 additions & 0 deletions coq/utf8.mli
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ val index_of_char : line:string -> char:char -> index option

(** Lenght in utf-8 chars *)
val length : string -> char

(** Get the byte potition of a code point indexed in UTF-16 code units in a
UTF-8 encoded string. *)
val get_byte_offset_from_utf16_pos : string -> int -> int option
4 changes: 2 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@

packages = l.attrValues {
inherit (config.treefmt.build) wrapper;
inherit (pkgs) dune_3 nodejs;
inherit (ocamlPackages) ocaml ocaml-lsp;
inherit (pkgs) dune_3 nodejs dune-release;
inherit (ocamlPackages) ocaml ocaml-lsp alcotest;
};
};
};
Expand Down
4 changes: 4 additions & 0 deletions unit_test/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(test
(name unittests)
(link_flags -linkall)
(libraries alcotest controller))
30 changes: 30 additions & 0 deletions unit_test/unittests.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
(* Build with `ocamlbuild -pkg alcotest simple.byte` *)

open Coq

let testcases_x =
[ ("aax", 2, true)
; (" xoo", 2, true)
; ("0123", 4, false)
; (" 𝒞x", 4, true)
; (" 𝒞x𝒞", 4, true)
; (" 𝒞∫x", 5, true)
; (" 𝒞", 4, false)
; ("∫x.dy", 1, true)
]

(* The tests *)
let test_x () =
List.iter
(fun (s, i, b) ->
let res = Utf8.get_byte_offset_from_utf16_pos s i in
if b then
let res = Option.map (fun i -> s.[i]) res in
Alcotest.(check (option char)) "x present" (Some 'x') res
else Alcotest.(check (option int)) "x present" None res)
testcases_x

(* Run it *)
let () =
let open Alcotest in
run "Controller" [ ("utf16", [ test_case "Find the x" `Quick test_x ]) ]

0 comments on commit ccb5168

Please sign in to comment.