diff --git a/README.md b/README.md index 11b76c9..69fb7b9 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Active Record extensions for HTML sanitization are available in the [`loofah-act * Add the _nofollow_ attribute to all hyperlinks. * Add the _target=\_blank_ attribute to all hyperlinks. * Remove _unprintable_ characters from text nodes. + * Replace _double line breaks_ with paragraph nodes. * Format markup as plain text, with (or without) sensible whitespace handling around block elements. * Replace Rails's `strip_tags` and `sanitize` view helper methods. @@ -235,6 +236,7 @@ doc.scrub!(:noopener) # adds rel="noopener" attribute to links doc.scrub!(:noreferrer) # adds rel="noreferrer" attribute to links doc.scrub!(:unprintable) # removes unprintable characters from text nodes doc.scrub!(:targetblank) # adds target="_blank" attribute to links +doc.scrub!(:double_breakpoint) # replaces double line breaks with paragraph nodes ``` See `Loofah::Scrubbers` for more details and example usage. diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index 63d3aec..eb68546 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -350,6 +350,57 @@ def scrub(node) end end + # + # === scrub!(:double_breakpoint) + # + # +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags. + # + # double_breakpoint_markup = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

" + # Loofah.html5_fragment(double_breakpoint_markup).scrub!(:double_breakpoint) + # => "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

" + # + class DoubleBreakpoint < Scrubber + def initialize # rubocop:disable Lint/MissingSuper + @direction = :top_down + end + + def scrub(node) + return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p") + + paragraph_with_break_point_nodes = node.xpath("//p[br[following-sibling::br]]") + + paragraph_with_break_point_nodes.each do |paragraph_node| + new_paragraph = paragraph_node.add_previous_sibling("

").first + + paragraph_node.children.each do |child| + remove_blank_text_nodes(child) + end + + paragraph_node.children.each do |child| + # already unlinked + next if child.parent.nil? + + if child.name == "br" && child.next_sibling.name == "br" + new_paragraph = paragraph_node.add_previous_sibling("

").first + child.next_sibling.unlink + child.unlink + else + child.parent = new_paragraph + end + end + + paragraph_node.unlink + end + + CONTINUE + end + + private + + def remove_blank_text_nodes(node) + node.unlink if node.text? && node.blank? + end + end # # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune). # @@ -364,6 +415,7 @@ def scrub(node) targetblank: TargetBlank, newline_block_elements: NewlineBlockElements, unprintable: Unprintable, + double_breakpoint: DoubleBreakpoint, } class << self diff --git a/test/integration/test_scrubbers.rb b/test/integration/test_scrubbers.rb index d7665be..37b8d81 100644 --- a/test/integration/test_scrubbers.rb +++ b/test/integration/test_scrubbers.rb @@ -50,6 +50,9 @@ class IntegrationTestScrubbers < Loofah::TestCase ENTITY_HACK_ATTACK_TEXT_SCRUB = "Hack attack!<script>alert('evil')</script>" ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC = "Hack attack!" + BREAKPOINT_FRAGMENT = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

Et cetera...

" + BREAKPOINT_RESULT = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

Et cetera...

" + context "scrubbing shortcuts" do context "#scrub_document" do it "is a shortcut for parse-and-scrub" do @@ -236,6 +239,16 @@ def html5? assert_equal doc, result end end + + context ":double_breakpoint" do + it "replaces double line breaks with paragraph tags" do + doc = klass.parse("#{BREAKPOINT_FRAGMENT}") + result = doc.scrub!(:double_breakpoint) + + assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html.delete("\n") + assert_equal doc, result + end + end end context "#text" do