Skip to content

Commit

Permalink
use a much more advanced whitespace escaping algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
Christian Kruse committed Jan 14, 2024
1 parent 10f117f commit 4e2e902
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 22 deletions.
72 changes: 52 additions & 20 deletions lib/microformats2.ex
Original file line number Diff line number Diff line change
Expand Up @@ -107,34 +107,66 @@ defmodule Microformats2 do
}
end

defp replace_whitespaces(text, last_text \\ "")
defp replace_whitespaces(text, last_text) when last_text == text, do: text
# Base case: the input is exhausted, so the accumulated output is the result.
defp escape_whitespaces("", acc, _in_tag, _in_attr, _end_quote, _in_special), do: acc

defp replace_whitespaces(text, _) do
text
|> String.replace(~r/>((&#32;)*) ( *)</, ">\\g{1}&#32;\\g{3}<")
|> replace_whitespaces(text)
# Consumes the input one UTF-8 codepoint at a time and appends the (possibly
# escaped) codepoint to `new_content`, recursing on the remainder.
#
# Outside of tags, a space becomes "&#32;", "\n" becomes "&#x0A;", "\v" becomes
# "&#x0B;" and "\r" becomes "&#x0D;". Every other character (tabs included) is
# copied unchanged, as is everything inside a tag or inside a special element.
#
# State carried through the recursion:
#   * `in_tag`     - true between the "<" that opens a tag and its closing ">"
#   * `in_attr`    - true while inside a quoted attribute value of that tag
#   * `end_quote`  - the quote character that terminates the current attribute
#                    value ("" when not inside one)
#   * `in_special` - true while inside a style, script or svg element, whose
#                    content is passed through verbatim
#
# The order of the `cond` clauses is significant: the special-element and
# attribute checks must run before the generic "tag starts"/"tag ends" checks.
defp escape_whitespaces(
       <<cp::utf8, rest::binary>>,
       new_content,
       in_tag,
       in_attr,
       end_quote,
       in_special
     ) do
  char = <<cp::utf8>>

  cond do
    # special tags: start
    # HTML tag names are ASCII case-insensitive, so match <SCRIPT>, <Style>, ...
    # as well (hence the `i` modifier). The remainder of the opening tag is
    # copied by the `in_special` clause below.
    char == "<" && in_tag == false && String.match?(rest, ~r/\A(style|script|svg)[\s>]/i) ->
      escape_whitespaces(rest, new_content <> char, false, false, "", true)

    # special tags: end. Leaves special mode and treats the closing tag as a
    # regular tag so that its ">" resets `in_tag`.
    # NOTE(review): any of the three closing tags ends special mode, not only
    # the one that opened it - confirm this is acceptable for the inputs seen.
    char == "<" && in_special && String.match?(rest, ~r/\A\/(style|script|svg)[\s>]/i) ->
      escape_whitespaces(rest, new_content <> char, true, false, "", false)

    # inside a special element: copy everything verbatim
    in_special ->
      escape_whitespaces(rest, new_content <> char, in_tag, in_attr, end_quote, in_special)

    # closing quote of the current attribute value
    char == end_quote && in_tag && in_attr ->
      escape_whitespaces(rest, new_content <> char, in_tag, false, "", in_special)

    # opening quote of an attribute value; remember which quote closes it
    char in ["\"", "'"] && in_tag && !in_attr ->
      escape_whitespaces(rest, new_content <> char, in_tag, true, char, in_special)

    # "<" or ">" inside a quoted attribute value must not change the tag state
    char in ["<", ">"] && in_attr ->
      escape_whitespaces(rest, new_content <> char, in_tag, in_attr, end_quote, in_special)

    # tag ends
    char == ">" && in_tag == true && in_special == false ->
      escape_whitespaces(rest, new_content <> char, false, false, "", false)

    # tag starts
    char == "<" && in_tag == false ->
      escape_whitespaces(rest, new_content <> char, true, false, "", false)

    # whitespaces
    char == " " && in_tag == false ->
      escape_whitespaces(rest, new_content <> "&#32;", in_tag, in_attr, end_quote, in_special)

    char == "\n" && in_tag == false ->
      escape_whitespaces(rest, new_content <> "&#x0A;", in_tag, in_attr, end_quote, in_special)

    char == "\v" && in_tag == false ->
      escape_whitespaces(rest, new_content <> "&#x0B;", in_tag, in_attr, end_quote, in_special)

    char == "\r" && in_tag == false ->
      escape_whitespaces(rest, new_content <> "&#x0D;", in_tag, in_attr, end_quote, in_special)

    # anything else is copied unchanged
    true ->
      escape_whitespaces(rest, new_content <> char, in_tag, in_attr, end_quote, in_special)
  end
end

# this is a really ugly hack, but html5ever doesn't support template tags (it fails with a nif_panic),
# mochiweb has bugs with whitespaces and I can't really get fast_html to work
defp parsed_document(content) do
content
|> replace_whitespaces()
|> String.replace(~r/\015/, "&#x0D;")
|> String.replace(~r/\012/, "&#x0A;")
|> String.replace(~r/\013/, "&#x0B;")
|> escape_whitespaces("", false, false, "", false)
|> Floki.parse_document()
|> normalize_tag_names()
end

defp normalize_tag_names({:ok, tree}) do
{:ok,
Floki.traverse_and_update(tree, fn
{tag, attrs, children} -> {String.trim(tag), attrs, children}
other -> other
end)}
end
# |> IO.inspect()

defp normalize_tag_names(other), do: other
# |> normalize_tag_names()
end
end
2 changes: 1 addition & 1 deletion test/items_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ defmodule Microformats2.ItemsTest do
"rel-urls" => %{
"http://localhost:9000/" => %{
"rels" => ["me"],
"text" => "\n \n Jacky Alciné\n \n "
"text" => "\n \n Jacky Alciné\n "
}
},
"rels" => %{"me" => ["http://localhost:9000/"]}
Expand Down
2 changes: 1 addition & 1 deletion test/nesting_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ defmodule Microformats2.NestingTest do
"value" => "Zachary Dunn"
}
],
"category" => ["IndieWeb", "tech", "decentralization"],
"published" => ["2024-01-09 17:11:10Z"],
"repost-of" => [
%{
"properties" => %{
"category" => ["IndieWeb", "tech", "decentralization"],
"name" => ["gilest.org: Make the indie web easier"],
"url" => ["https://gilest.org/indie-easy.html"]
},
Expand Down

0 comments on commit 4e2e902

Please sign in to comment.