Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: handle utf8 characters in the dune files(#9728) #10113

Merged
merged 5 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion boot/libs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ let local_libraries =
; ("vendor/fiber/src", Some "Fiber", false, None)
; ("src/dune_console", Some "Dune_console", false, None)
; ("src/memo", Some "Memo", false, None)
; ("vendor/uutf", None, false, None)
; ("src/dune_sexp", Some "Dune_sexp", false, None)
; ("src/ocaml-config", Some "Ocaml_config", false, None)
; ("src/ocaml", Some "Ocaml", false, None)
Expand All @@ -23,7 +24,6 @@ let local_libraries =
; ("otherlibs/dune-rpc/private", Some "Dune_rpc_private", false, None)
; ("src/dune_config", Some "Dune_config", false, None)
; ("vendor/sha", None, false, None)
; ("vendor/uutf", None, false, None)
; ("vendor/opam/src/core", None, false, None)
; ("vendor/opam-file-format", None, false, None)
; ("vendor/opam/src/format", None, false, None)
Expand Down
2 changes: 2 additions & 0 deletions doc/changes/10113.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- dune file formatting: output utf8 if input is correctly encoded (#10113,
fixes #9728, @moyodiallo)
2 changes: 1 addition & 1 deletion src/dune_sexp/dune
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(library
(name dune_sexp)
(synopsis "[Internal] S-expression library")
(libraries stdune)
(libraries stdune dune_uutf)
(instrumentation
(backend bisect_ppx)))

Expand Down
92 changes: 71 additions & 21 deletions src/dune_sexp/escape.ml
Original file line number Diff line number Diff line change
@@ -1,26 +1,64 @@
open! Stdune

(** [next_valid_utf8_uchar_len s i] tries to decode a single UTF-8 encoded
    character from [s], reading bytes from index [i] onwards.

    The bytes are handed one at a time to a manual-source [Uutf] decoder
    configured for UTF-8. The result is:

    - [Some n] as soon as the decoder reports a character, where [n] is the
      number of bytes of [s] that were fed to it (starting at [i]);
    - [None] if the decoder reports malformed input, or if [s] runs out of
      bytes while the decoder is still waiting for more.

    A [Code_error] is raised if the decoder reports [`End]; this is presumably
    unreachable since every [Uutf.Manual.src] call below supplies exactly one
    byte (NOTE(review): confirm against the [Uutf] documentation).

    Note: on OCaml >= 4.14, this can be switched to the following (and the
    dependency to [Uutf] can be removed)

    {[
      let next_valid_utf8_uchar_len s i =
        let decode = String.get_utf_8_uchar s i in
        Option.some_if (Uchar.utf_decode_is_valid decode) (Uchar.utf_decode_length decode)
      ;;
    ]} *)
let next_valid_utf8_uchar_len s i =
  let len = String.length s in
  let decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
  (* One-byte scratch buffer reused for every byte handed to the decoder. *)
  let cell = Bytes.create 1 in
  (* [feed next] drives the decoder; [next] is the index of the first byte of
     [s] that has not been given to the decoder yet. *)
  let rec feed next =
    match Uutf.decode decoder with
    | `Uchar _ -> Some (next - i)
    | `Malformed _ -> None
    | `End -> Code_error.raise "next_valid_utf8_uchar: `End" []
    | `Await ->
      if next < len
      then (
        Bytes.set cell 0 (String.get s next);
        Uutf.Manual.src decoder cell 0 1;
        feed (next + 1))
      else
        (* The string ended in the middle of a multi-byte sequence. *)
        None
  in
  feed i
;;

let quote_length s =
let n = ref 0 in
let len = String.length s in
for i = 0 to len - 1 do
n
:= !n
+
match String.unsafe_get s i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ -> 4
let i = ref 0 in
while !i < len do
(n
:= !n
+
match String.unsafe_get s !i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ ->
(match next_valid_utf8_uchar_len s !i with
| Some uchar_len ->
i := !i + uchar_len - 1;
uchar_len
| None -> 4));
incr i
done;
!n
;;

let escape_to s ~dst:s' ~ofs =
let n = ref ofs in
let len = String.length s in
for i = 0 to len - 1 do
(match String.unsafe_get s i with
let i = ref 0 in
while !i < len do
(match String.unsafe_get s !i with
| ('\"' | '\\') as c ->
Bytes.unsafe_set s' !n '\\';
incr n;
Expand All @@ -41,21 +79,33 @@ let escape_to s ~dst:s' ~ofs =
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n 'b'
| '%' when i + 1 < len && s.[i + 1] = '{' ->
| '%' when !i + 1 < len && s.[!i + 1] = '{' ->
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n '%'
| ' ' .. '~' as c -> Bytes.unsafe_set s' !n c
| c ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))));
incr n
(match next_valid_utf8_uchar_len s !i with
| Some uchar_len ->
Bytes.unsafe_set s' !n (String.unsafe_get s !i);
Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1));
if uchar_len > 2
then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2));
if uchar_len > 3
then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3));
n := !n + uchar_len - 1;
i := !i + uchar_len - 1
| None ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))));
incr n;
incr i
done
;;

Expand Down
28 changes: 25 additions & 3 deletions test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
Original file line number Diff line number Diff line change
@@ -1,13 +1,35 @@
How the non-ASCII characters are handled, this is also related to the issue #9728
Utf8 characters are handled for now, this is also related to the issue #9728

$ dune format-dune-file <<EOF
> ("É")
> ("Éff ĎúÑȨ")
> EOF
("\195\137")
("Éff ĎúÑȨ")

$ dune format-dune-file <<EOF
> (run foo %{bin:é})
> EOF
File "", line 1, characters 15-16:
Error: The character '\195' is not allowed inside %{...} forms
[1]

$ dune format-dune-file <<EOF
> (echo "hÉllo")
> EOF
(echo "hÉllo")

$ dune format-dune-file <<EOF
> (echo "É")
> EOF
(echo "É")

$ dune format-dune-file <<EOF
> (Écho "hello")
> EOF
File "", line 1, characters 1-1:
Error: Invalid . file
[1]
emillon marked this conversation as resolved.
Show resolved Hide resolved

$ bash -c "printf '(echo \"%b\")' '\xc0'"| dune format-dune-file
(echo "\192")
$ bash -c "printf '(echo \"%b\")' '\xf0'"| dune format-dune-file
(echo "\240")
Loading