Skip to content

Commit

Permalink
rc utf-8 implementation and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Moosieus committed Oct 15, 2023
1 parent 888c769 commit 79f9f80
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 37 deletions.
4 changes: 2 additions & 2 deletions lib/unicode/validation/utf16.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ defmodule Unicode.Validation.UTF16 do
end

# good sequence
defp do_replace(<<_::utf16, rest::binary>>, rep, acc) do
# Valid UTF-16 code point: copy it through to the output unchanged.
# The code point used to be matched and then discarded, so only replacement
# markers ever reached the accumulator and all valid text was lost.
defp do_replace(<<codepoint::utf16, rest::bits>>, rep, acc) do
  do_replace(rest, rep, <<acc::bits, codepoint::utf16>>)
end

# illegal sequence
defp do_replace(<<_::binary-size(2), rest::binary>>, rep, acc) do
# Reached when the leading bytes did not match the good-sequence clause:
# skip one 2-byte unit and append the replacement to the output.
defp do_replace(<<_unit::bytes-size(2), tail::bits>>, replacement, out),
  do: do_replace(tail, replacement, <<out::bits, replacement::bits>>)
end
4 changes: 2 additions & 2 deletions lib/unicode/validation/utf16le.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ defmodule Unicode.Validation.UTF16LE do
end

# good sequence
defp do_replace(<<_::utf16-little, rest::binary>>, rep, acc) do
# Valid little-endian UTF-16 code point: copy it through to the output
# unchanged. The code point used to be matched and then discarded, so only
# replacement markers ever reached the accumulator and all valid text was lost.
defp do_replace(<<codepoint::utf16-little, rest::bits>>, rep, acc) do
  do_replace(rest, rep, <<acc::bits, codepoint::utf16-little>>)
end

# illegal sequence
defp do_replace(<<_::binary-size(2), rest::binary>>, rep, acc) do
# Reached when the leading bytes did not match the good-sequence clause:
# skip one 2-byte unit and append the replacement to the output.
defp do_replace(<<_unit::bytes-size(2), tail::bits>>, replacement, out),
  do: do_replace(tail, replacement, <<out::bits, replacement::bits>>)
end
4 changes: 2 additions & 2 deletions lib/unicode/validation/utf32.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ defmodule Unicode.Validation.UTF32 do
end

# good sequence
defp do_replace(<<_::utf32, rest::binary>>, rep, acc) do
# Valid UTF-32 code point: copy it through to the output unchanged.
# The code point used to be matched and then discarded, so only replacement
# markers ever reached the accumulator and all valid text was lost.
defp do_replace(<<codepoint::utf32, rest::bits>>, rep, acc) do
  do_replace(rest, rep, <<acc::bits, codepoint::utf32>>)
end

# illegal sequence
defp do_replace(<<_::binary-size(4), rest::binary>>, rep, acc) do
# Reached when the leading bytes did not match the good-sequence clause:
# skip one 4-byte unit and append the replacement to the output.
defp do_replace(<<_unit::bytes-size(4), tail::bits>>, replacement, out),
  do: do_replace(tail, replacement, <<out::bits, replacement::bits>>)
end
4 changes: 2 additions & 2 deletions lib/unicode/validation/utf32le.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ defmodule Unicode.Validation.UTF32LE do
end

# good sequence
defp do_replace(<<_::utf32-little, rest::binary>>, rep, acc) do
# Valid little-endian UTF-32 code point: copy it through to the output
# unchanged. The code point used to be matched and then discarded, so only
# replacement markers ever reached the accumulator and all valid text was lost.
defp do_replace(<<codepoint::utf32-little, rest::bits>>, rep, acc) do
  do_replace(rest, rep, <<acc::bits, codepoint::utf32-little>>)
end

# illegal sequence
defp do_replace(<<_::binary-size(4), rest::binary>>, rep, acc) do
# Reached when the leading bytes did not match the good-sequence clause:
# skip one 4-byte unit and append the replacement to the output.
defp do_replace(<<_unit::bytes-size(4), tail::bits>>, replacement, out),
  do: do_replace(tail, replacement, <<out::bits, replacement::bits>>)
end
35 changes: 20 additions & 15 deletions lib/unicode/validation/utf8.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,57 +5,62 @@ defmodule Unicode.Validation.UTF8 do
do_replace(bytes, replacement, <<>>)
end

# match ascii characters first for speed
defp do_replace(<<ascii::8, n_lead::2, rest::bytes>>, rep, acc) when ascii in 0..127 and n_lead != 0b10 do
do_replace(rest, rep, <<acc::bits, ascii::8, n_lead::2>>)
# ASCII (for better average speed)
# ASCII fast path: a byte in 0..127 is a complete code point by itself, so
# copy it and continue with the untouched tail. Matching only that one byte
# avoids rebuilding `<<n_lead::2, rest::bits>>` on every step, which copied
# the whole remaining input once per ASCII character. No look-ahead is needed:
# a stray continuation byte that follows is replaced when the next call
# reaches it via the catch-all clause.
defp do_replace(<<ascii::8, rest::bits>>, rep, acc) when ascii in 0..127 do
  do_replace(rest, rep, <<acc::bits, ascii::8>>)
end

defp do_replace(<<grapheme::utf8, rest::bytes>>, rep, acc) do
# UTF-8 (valid)
# A well-formed UTF-8 code point is re-encoded onto the output as-is.
defp do_replace(<<codepoint::utf8, tail::bits>>, replacement, out),
  do: do_replace(tail, replacement, <<out::bits, codepoint::utf8>>)

# 2/3-byte truncated
# 2/3 truncated sequence

# 3-byte sequence cut off after its 2nd byte, at the very end of the input.
# The 4 + 6 payload bits seen so far are combined into one integer and handed
# to ii_of_iii/2, whose result is appended to form the final output.
defp do_replace(<<0b1110::4, lead::4, 0b10::2, cont::6>>, rep, acc) do
  partial = lead * 64 + cont
  <<acc::bits, ii_of_iii(partial, rep)::bits>>
end

# 3-byte sequence cut off after its 2nd byte, with more input behind it.
# The two bits after the pair are not a continuation marker (0b10), so they
# are put back in front of the tail before recursing.
defp do_replace(<<0b1110::4, lead::4, 0b10::2, cont::6, next::2, tail::bits>>, rep, acc)
     when next != 0b10 do
  partial = lead * 64 + cont
  replaced = ii_of_iii(partial, rep)
  do_replace(<<next::2, tail::bits>>, rep, <<acc::bits, replaced::bits>>)
end

# 2/4-byte truncated
# 2/4

# 4-byte sequence cut off after its 2nd byte, at the very end of the input.
# The 3 + 6 payload bits seen so far are combined into one integer and handed
# to ii_of_iiii/2, whose result is appended to form the final output.
defp do_replace(<<0b11110::5, lead::3, 0b10::2, cont::6>>, rep, acc) do
  partial = lead * 64 + cont
  <<acc::bits, ii_of_iiii(partial, rep)::bits>>
end

# 4-byte sequence cut off after its 2nd byte, with more input behind it.
# The two bits after the pair are not a continuation marker (0b10), so they
# are put back in front of the tail before recursing.
defp do_replace(<<0b11110::5, lead::3, 0b10::2, cont::6, next::2, tail::bits>>, rep, acc)
     when next != 0b10 do
  partial = lead * 64 + cont
  replaced = ii_of_iiii(partial, rep)
  do_replace(<<next::2, tail::bits>>, rep, <<acc::bits, replaced::bits>>)
end

# 3/4-byte truncated
# 3/4

# 4-byte sequence cut off after its 3rd byte, at the very end of the input.
# The 3 + 6 + 6 payload bits seen so far are combined into one integer and
# handed to iii_of__iiii/2, whose result is appended to form the final output.
defp do_replace(<<0b11110::5, lead::3, 0b10::2, mid::6, 0b10::2, last::6>>, rep, acc) do
  partial = lead * 4096 + mid * 64 + last
  <<acc::bits, iii_of__iiii(partial, rep)::bits>>
end

defp do_replace(<<0b11110::5, i::3, 0b10::2, ii::6, 0b10::2, iii::6, n_lead::2, rest::bytes>>, rep, acc) when n_lead != 0b10 do
# 4-byte sequence cut off after its 3rd byte, with more input behind it.
# The two bits after the triple are not a continuation marker (0b10), so they
# are put back in front of the tail before recursing.
defp do_replace(
       <<0b11110::5, lead::3, 0b10::2, mid::6, 0b10::2, last::6, next::2, tail::bits>>,
       rep,
       acc
     )
     when next != 0b10 do
  partial = lead * 4096 + mid * 64 + last
  replaced = iii_of__iiii(partial, rep)
  do_replace(<<next::2, tail::bits>>, rep, <<acc::bits, replaced::bits>>)
end

defp do_replace(<<_, rest::bytes>>, rep, acc), do: do_replace(rest, rep, <<acc::bits, rep::bytes>>)
# Everything else

# Fallback: any byte that no earlier clause accepted is dropped and the
# replacement is appended in its place.
defp do_replace(<<_byte, tail::bits>>, replacement, out) do
  do_replace(tail, replacement, <<out::bits, replacement::bits>>)
end

# Final

# Input exhausted: the accumulated output is the result.
defp do_replace(<<>>, _rep, acc) do
  acc
end

# bounds-checking truncated code points for overlong encodings

# Decides how a 3-byte sequence truncated after its 2nd byte is replaced:
# `tcp` values in 32..863 or 896..1023 yield the replacement once, anything
# else yields it twice.
defp ii_of_iii(tcp, rep)
     when (tcp >= 32 and tcp <= 863) or (tcp >= 896 and tcp <= 1023),
     do: rep

defp ii_of_iii(_tcp, rep), do: rep <> rep
Expand Down
File renamed without changes.
45 changes: 31 additions & 14 deletions test/unicode_validation_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,59 @@ defmodule Unicode.Validation.UTF8.Test do

alias Unicode.Validation.UTF8.Test.Helpers, as: Helpers

# Todo:
# Produce randomized binaries implementations can be run against.
# Implement a way to save interesting or breaking tests.
# Also add a few sanity-check static tests.
# Todo/Nice to haves:
# Save interesting or breaking tests.
# More cases

# A single well-formed character must pass through untouched.
test "single valid sequence" do
  input = "é"

  assert Unicode.replace_invalid(input, :utf8) === input
end

# Every randomly generated valid sequence must come back unchanged.
test "valid sequences" do
  # Enum.each/2 rather than Enum.map/2: the loop runs only for its
  # assertions, so no throwaway result list is built.
  Enum.each(0..10_000, fn _ ->
    char = Helpers.random_valid_sequence()

    assert Unicode.replace_invalid(char, :utf8) === char
  end)
end

# A truncated "é" must be turned into the replacement the helper predicts.
test "single truncated sequence" do
  {truncated, expected} = Helpers.truncate("é")

  assert Unicode.replace_invalid(truncated, :utf8) === expected
end

# Every randomly generated truncated sequence must be replaced exactly as
# the helper predicts.
test "truncated sequences" do
  # Enum.each/2 rather than Enum.map/2: the loop runs only for its
  # assertions, so no throwaway result list is built.
  Enum.each(0..10_000, fn _ ->
    {char, rep} = Helpers.random_truncated()

    assert Unicode.replace_invalid(char, :utf8) === rep
  end)
end

# An overlong 3-byte encoding of "é" must be turned into the replacement the
# helper predicts.
test "single overlong sequence" do
  {input, expected} = Helpers.overlong("é", 3)

  assert Unicode.replace_invalid(input, :utf8) === expected
end

test "clean multilingual hello world json" do
# https://github.com/novellac/multilanguage-hello-json/tree/master
j = File.read!("test/hello.json")
test "overlong sequences" do
Enum.map(0..10_000, fn _ ->
{char, rep} = Helpers.random_overlong()

assert j === Unicode.replace_invalid(j, :utf8)
assert Unicode.replace_invalid(char, :utf8) === rep
end)
end

test "randomly generated illegal binary" do
# generate two binary strings, one valid and one invalid
test "clean multilingual hello world json" do
j = File.read!("test/hello.json") # https://github.com/novellac/multilanguage-hello-json/tree/master

orig = <<>>
final = <<>>
assert j === Unicode.replace_invalid(j, :utf8)
end

File.write!("test/last_valid.bin")
test "randomly generated illegal binary stress test" do
{invalid, correct} = Helpers.random_sequences(100_000)

assert true === true
assert Unicode.replace_invalid(invalid, :utf8) === correct
end
end

0 comments on commit 79f9f80

Please sign in to comment.