Skip to content

Commit

Permalink
[utf] [lang] Some more efficient functions and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ejgallego committed May 22, 2024
1 parent e6cd68c commit 15b12f3
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 36 deletions.
4 changes: 1 addition & 3 deletions coq/utils.ml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@
let utf16_offset_of_utf8_offset ~lines ~line ~byte =
if line < Array.length lines then
let line = Array.get lines line in
match Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte with
| Some char -> char
| None -> Lang.Utf.length_utf16 line
Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte
else 0

let to_range ~lines (p : Loc.t) : Lang.Range.t =
Expand Down
5 changes: 5 additions & 0 deletions examples/unicode1.v
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ Goal forall Γ Δ, Γ ⊆ Δ -> P Γ.
(* check goal is updated after the intros here properly *)
intros Γ Δ s.
foo.
Abort.

Goal forall Γ Δ, Γ ⊆ Δ -> P Γ.
(* check goal is updated after the intros here properly *)
intros Γ Δ 𝒞. foo.
92 changes: 62 additions & 30 deletions lang/utf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -112,30 +112,6 @@ let nth s n = nth_aux s 0 n

(* end of camomille *)

(* That's a tricky one, if the char we are requesting is out of bounds, then we
return the last index, 0 in the case line is empty. *)
let utf8_offset_of_char ~line ~char =
if char < length line then Some (nth line char) else None

let find_char line byte =
let rec f index n_chars =
let next_index = next line index in
if next_index > byte then n_chars else f next_index (n_chars + 1)
in
if byte < String.length line then Some (f 0 0) else None

let char_of_utf8_offset ~line ~offset =
(* if Debug.unicode then *)
(* Io.Log.trace "char_of_index" *)
(* (Format.asprintf "str: '%s' | byte: %d" line byte); *)
let char = find_char line offset in
(* (if Debug.unicode then *)
(* match char with *)
(* | None -> Io.Log.trace "get_last_text" "failed" *)
(* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
char)); *)
char

(* We disabled auto-formatting in copied code *)
[@@@ocamlformat "disable=true"]

Expand Down Expand Up @@ -242,6 +218,21 @@ let string_get_utf_8_uchar s i =
(* End of copy from Stdlib *)
[@@@ocamlformat "disable=false"]

let length_utf16 line =
let byte_idx = ref 0 in
let utf16_len = ref 0 in
let len = String.length line in
while !byte_idx < len do
let ch = string_get_utf_8_uchar line !byte_idx in
let next_idx = next line !byte_idx in
byte_idx := next_idx;
let l = uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2 in
utf16_len := !utf16_len + l
done;
!utf16_len

(* UTF16 <-> UTF8 *)

let utf8_offset_of_utf16_offset ~line ~(offset : utf16_index) =
let byte_idx = ref 0 in
let utf16_char_count = ref 0 in
Expand All @@ -260,6 +251,29 @@ let utf8_offset_of_utf16_offset ~line ~(offset : utf16_index) =
with _ -> ());
!byte_idx

let utf16_offset_of_utf8_offset ~line ~(offset : utf8_index) =
let byte_idx = ref 0 in
let utf16_char_count = ref 0 in
let len = String.length line in
(try
while !byte_idx < offset do
let ch = string_get_utf_8_uchar line !byte_idx in
let next_idx = next line !byte_idx in
if next_idx > len then raise Not_found else byte_idx := next_idx;
let code_unit_count =
uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2
in
utf16_char_count := !utf16_char_count + code_unit_count;
()
done
with _ -> ());
!utf16_char_count

(******************************************************)
(** Not used anywhere, remove? *)
(******************************************************)

(* UTF16 <-> Char *)
let char_of_utf16_offset ~line ~(offset : utf16_index) =
let byte_idx = ref 0 in
let count = ref 0 in
Expand Down Expand Up @@ -291,10 +305,28 @@ let utf16_offset_of_char ~line ~(char : char) =
done;
!offset16

let utf16_offset_of_utf8_offset ~line ~offset =
let char = char_of_utf8_offset ~line ~offset in
Option.map (fun char -> utf16_offset_of_char ~line ~char) char
(* UTF8 <-> Char *)

let length_utf16 line =
let length = length line in
if length > 0 then utf16_offset_of_char ~line ~char:(length - 1) + 1 else 0
(* That's a tricky one, if the char we are requesting is out of bounds, then we
return the last index, 0 in the case line is empty. *)
let utf8_offset_of_char ~line ~char =
if char < length line then Some (nth line char) else None

let find_char line byte =
let rec f index n_chars =
let next_index = next line index in
if next_index > byte then n_chars else f next_index (n_chars + 1)
in
if byte < String.length line then Some (f 0 0) else None

let char_of_utf8_offset ~line ~offset =
(* if Debug.unicode then *)
(* Io.Log.trace "char_of_index" *)
(* (Format.asprintf "str: '%s' | byte: %d" line byte); *)
let char = find_char line offset in
(* (if Debug.unicode then *)
(* match char with *)
(* | None -> Io.Log.trace "get_last_text" "failed" *)
(* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d"
char)); *)
char
7 changes: 4 additions & 3 deletions lang/utf.mli
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ module Encoding : sig
| Utf32
end

(* Future work: support multiple encondings *)
(* val set_protocol_encoding : *)
(** Future work: support setting protocol enconding *)
(* val set_protocol_encoding : Enconding.t -> unit *)

(* utf8 utils, both Coq and Camomile have similar implementations, at some point
we should remove this but for now we keep it internal. For now we use the
Expand All @@ -39,7 +39,7 @@ type utf16_index = int

(** UTF-16 offset from UTF-8 offset; line is enconded in UTF-8 *)
val utf16_offset_of_utf8_offset :
line:utf8_string -> offset:utf8_index -> utf16_index option
line:utf8_string -> offset:utf8_index -> utf16_index

(** Get the byte position of a code point indexed in UTF-16 code units in a
UTF-8 encoded utf8_string. Returns the position of the last character if the
Expand All @@ -54,6 +54,7 @@ val length_utf16 : utf8_string -> utf16_index

(******************************************************)
(** Not used anywhere, remove? *)
(******************************************************)

(** Number of characters in the utf-8-encoded utf8_string. *)
val length : utf8_string -> char
Expand Down
46 changes: 46 additions & 0 deletions lang/utf_tests.ml
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
open Lang.Utf

(* 𝒞 = 2 codepoints in UTF-16 *)
let%test_unit "utf16 length" =
let test_cases =
[ ("aax", 3)
; (" xoo", 5)
; ("0123", 4)
; (" 𝒞x", 5)
; (" 𝒞x𝒞", 7)
; (" 𝒞∫x", 6)
; (" 𝒞", 4)
; ("∫x.dy", 5)
; (" 𝒰 ", 4)
]
in
List.iter
(fun (line, expected) ->
let res = length_utf16 line in
if res != expected then
failwith
(Printf.sprintf "Incorrect utf16_length for %s, got: %d expected: %d"
line res expected))
test_cases

let%test_unit "utf16 byte offsets" =
let check_last s i = i < String.length s && next s i == String.length s in
let testcases_x =
Expand All @@ -26,6 +49,29 @@ let%test_unit "utf16 byte offsets" =
(Printf.sprintf "Shouldn't find x in test %s / %d got %d" s i res))
testcases_x

let%test_unit "utf16 unicode offsets" =
(* line, utf8 offset, utf16 offset *)
let testcases =
[ ("aax", 2, 2)
; (" xoo", 2, 2)
; ("0123", 4, 4)
; (" 𝒞x", 4, 4)
; (" 𝒞x𝒞", 4, 4)
; (" 𝒞∫x", 5, 4)
; (" 𝒞", 4, 4)
; ("∫x.dy", 1, 1)
; (" 𝒰 ", 4, 3)
]
in
List.iter
(fun (line, offset, expect) ->
let res = utf16_offset_of_utf8_offset ~line ~offset in
if res != expect then
failwith
(Printf.sprintf "Wrong result: got %d expected %d in test %s" res
expect line))
testcases

let%test_unit "utf16 unicode offsets" =
let testcases =
[ ("aax", 2, 2)
Expand Down

0 comments on commit 15b12f3

Please sign in to comment.