diff --git a/coq/utils.ml b/coq/utils.ml index 33c53b837..0e6aa7ebe 100644 --- a/coq/utils.ml +++ b/coq/utils.ml @@ -19,9 +19,7 @@ let utf16_offset_of_utf8_offset ~lines ~line ~byte = if line < Array.length lines then let line = Array.get lines line in - match Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte with - | Some char -> char - | None -> Lang.Utf.length_utf16 line + Lang.Utf.utf16_offset_of_utf8_offset ~line ~offset:byte else 0 let to_range ~lines (p : Loc.t) : Lang.Range.t = diff --git a/examples/unicode1.v b/examples/unicode1.v index 276131972..cb0753b94 100644 --- a/examples/unicode1.v +++ b/examples/unicode1.v @@ -7,3 +7,8 @@ Goal forall Γ Δ, Γ ⊆ Δ -> P Γ. (* check goal is updated after the intros here properly *) intros Γ Δ s. foo. +Abort. + +Goal forall Γ Δ, Γ ⊆ Δ -> P Γ. +(* check goal is updated after the intros here properly *) +intros Γ Δ 𝒞. foo. diff --git a/lang/utf.ml b/lang/utf.ml index 048ba170f..a4e780498 100644 --- a/lang/utf.ml +++ b/lang/utf.ml @@ -112,30 +112,6 @@ let nth s n = nth_aux s 0 n (* end of camomille *) -(* That's a tricky one, if the char we are requesting is out of bounds, then we - return the last index, 0 in the case line is empty. *) -let utf8_offset_of_char ~line ~char = - if char < length line then Some (nth line char) else None - -let find_char line byte = - let rec f index n_chars = - let next_index = next line index in - if next_index > byte then n_chars else f next_index (n_chars + 1) - in - if byte < String.length line then Some (f 0 0) else None - -let char_of_utf8_offset ~line ~offset = - (* if Debug.unicode then *) - (* Io.Log.trace "char_of_index" *) - (* (Format.asprintf "str: '%s' | byte: %d" line byte); *) - let char = find_char line offset in - (* (if Debug.unicode then *) - (* match char with *) - (* | None -> Io.Log.trace "get_last_text" "failed" *) - (* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d" - char)); *) - char - (* We disabled auto-formatting in copied code *) [@@@ocamlformat "disable=true"] @@ -242,6 +218,21 @@ let string_get_utf_8_uchar s i = (* End of copy from Stdlib *) [@@@ocamlformat "disable=false"] +let length_utf16 line = + let byte_idx = ref 0 in + let utf16_len = ref 0 in + let len = String.length line in + while !byte_idx < len do + let ch = string_get_utf_8_uchar line !byte_idx in + let next_idx = next line !byte_idx in + byte_idx := next_idx; + let l = uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2 in + utf16_len := !utf16_len + l + done; + !utf16_len + +(* UTF16 <-> UTF8 *) + let utf8_offset_of_utf16_offset ~line ~(offset : utf16_index) = let byte_idx = ref 0 in let utf16_char_count = ref 0 in @@ -260,6 +251,29 @@ let utf8_offset_of_utf16_offset ~line ~(offset : utf16_index) = with _ -> ()); !byte_idx +let utf16_offset_of_utf8_offset ~line ~(offset : utf8_index) = + let byte_idx = ref 0 in + let utf16_char_count = ref 0 in + let len = String.length line in + (try + while !byte_idx < offset do + let ch = string_get_utf_8_uchar line !byte_idx in + let next_idx = next line !byte_idx in + if next_idx > len then raise Not_found else byte_idx := next_idx; + let code_unit_count = + uchar_utf_16_byte_length (uchar_utf_decode_uchar ch) / 2 + in + utf16_char_count := !utf16_char_count + code_unit_count; + () + done + with _ -> ()); + !utf16_char_count + +(******************************************************) +(** Not used anywhere, remove? *) +(******************************************************) + +(* UTF16 <-> Char *) let char_of_utf16_offset ~line ~(offset : utf16_index) = let byte_idx = ref 0 in let count = ref 0 in @@ -291,10 +305,28 @@ let utf16_offset_of_char ~line ~(char : char) = done; !offset16 -let utf16_offset_of_utf8_offset ~line ~offset = - let char = char_of_utf8_offset ~line ~offset in - Option.map (fun char -> utf16_offset_of_char ~line ~char) char +(* UTF8 <-> Char *) -let length_utf16 line = - let length = length line in - if length > 0 then utf16_offset_of_char ~line ~char:(length - 1) + 1 else 0 +(* That's a tricky one, if the char we are requesting is out of bounds, then we + return the last index, 0 in the case line is empty. *) +let utf8_offset_of_char ~line ~char = + if char < length line then Some (nth line char) else None + +let find_char line byte = + let rec f index n_chars = + let next_index = next line index in + if next_index > byte then n_chars else f next_index (n_chars + 1) + in + if byte < String.length line then Some (f 0 0) else None + +let char_of_utf8_offset ~line ~offset = + (* if Debug.unicode then *) + (* Io.Log.trace "char_of_index" *) + (* (Format.asprintf "str: '%s' | byte: %d" line byte); *) + let char = find_char line offset in + (* (if Debug.unicode then *) + (* match char with *) + (* | None -> Io.Log.trace "get_last_text" "failed" *) + (* | Some char -> Io.Log.trace "get_last_text" (Format.asprintf "char: %d" + char)); *) + char diff --git a/lang/utf.mli b/lang/utf.mli index 56eafed3e..1bdce61a0 100644 --- a/lang/utf.mli +++ b/lang/utf.mli @@ -21,8 +21,8 @@ module Encoding : sig | Utf32 end -(* Future work: support multiple encondings *) -(* val set_protocol_encoding : *) +(** Future work: support setting protocol enconding *) +(* val set_protocol_encoding : Enconding.t -> unit *) (* utf8 utils, both Coq and Camomile have similar implementations, at some point we should remove this but for now we keep it internal. For now we use the @@ -39,7 +39,7 @@ type utf16_index = int (** UTF-16 offset from UTF-8 offset; line is enconded in UTF-8 *) val utf16_offset_of_utf8_offset : - line:utf8_string -> offset:utf8_index -> utf16_index option + line:utf8_string -> offset:utf8_index -> utf16_index (** Get the byte position of a code point indexed in UTF-16 code units in a UTF-8 encoded utf8_string. Returns the position of the last character if the @@ -54,6 +54,7 @@ val length_utf16 : utf8_string -> utf16_index (******************************************************) (** Not used anywhere, remove? *) +(******************************************************) (** Number of characters in the utf-8-encoded utf8_string. *) val length : utf8_string -> char diff --git a/lang/utf_tests.ml b/lang/utf_tests.ml index 86fb141cc..e90017ec3 100644 --- a/lang/utf_tests.ml +++ b/lang/utf_tests.ml @@ -1,5 +1,28 @@ open Lang.Utf +(* 𝒞 = 2 codepoints in UTF-16 *) +let%test_unit "utf16 length" = + let test_cases = + [ ("aax", 3) + ; (" xoo", 5) + ; ("0123", 4) + ; (" 𝒞x", 5) + ; (" 𝒞x𝒞", 7) + ; (" 𝒞∫x", 6) + ; (" 𝒞", 4) + ; ("∫x.dy", 5) + ; (" 𝒰 ", 4) + ] + in + List.iter + (fun (line, expected) -> + let res = length_utf16 line in + if res != expected then + failwith + (Printf.sprintf "Incorrect utf16_length for %s, got: %d expected: %d" + line res expected)) + test_cases + let%test_unit "utf16 byte offsets" = let check_last s i = i < String.length s && next s i == String.length s in let testcases_x = @@ -26,6 +49,29 @@ let%test_unit "utf16 byte offsets" = (Printf.sprintf "Shouldn't find x in test %s / %d got %d" s i res)) testcases_x +let%test_unit "utf16 unicode offsets" = + (* line, utf8 offset, utf16 offset *) + let testcases = + [ ("aax", 2, 2) + ; (" xoo", 2, 2) + ; ("0123", 4, 4) + ; (" 𝒞x", 4, 4) + ; (" 𝒞x𝒞", 4, 4) + ; (" 𝒞∫x", 5, 4) + ; (" 𝒞", 4, 4) + ; ("∫x.dy", 1, 1) + ; (" 𝒰 ", 4, 3) + ] + in + List.iter + (fun (line, offset, expect) -> + let res = utf16_offset_of_utf8_offset ~line ~offset in + if res != expect then + failwith + (Printf.sprintf "Wrong result: got %d expected %d in test %s" res + expect line)) + testcases + let%test_unit "utf16 unicode offsets" = let testcases = [ ("aax", 2, 2)