From c94996859a116a24cdaaaf3b88d9200b3b72223a Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 12:01:59 -0400 Subject: [PATCH 1/5] test: Get basic test case for scrubber --- test/integration/test_scrubbers.rb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/integration/test_scrubbers.rb b/test/integration/test_scrubbers.rb index d7665be..6e8e31d 100644 --- a/test/integration/test_scrubbers.rb +++ b/test/integration/test_scrubbers.rb @@ -50,6 +50,9 @@ class IntegrationTestScrubbers < Loofah::TestCase ENTITY_HACK_ATTACK_TEXT_SCRUB = "Hack attack!<script>alert('evil')</script>" ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC = "Hack attack!" + BREAKPOINT_FRAGMENT = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

Et cetera...

" + BREAKPOINT_RESULT = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

Et cetera...

" + context "scrubbing shortcuts" do context "#scrub_document" do it "is a shortcut for parse-and-scrub" do @@ -236,6 +239,16 @@ def html5? assert_equal doc, result end end + + context ":double_breakpoint" do + it "replaces double line breaks with paragraph tags" do + doc = klass.parse("#{BREAKPOINT_FRAGMENT}") + result = doc.scrub!(:double_breakpoint) + + assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html + assert_equal doc, result + end + end end context "#text" do From be6812478a1f0fb64cb6c55fdbd42163f653334a Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 12:23:17 -0400 Subject: [PATCH 2/5] initial scaffold --- lib/loofah/scrubbers.rb | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index 63d3aec..db84131 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -350,6 +350,24 @@ def scrub(node) end end + # + # === scrub!(:double_breakpoint) + # + # +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags. + # + # double_breakpoint_markup = "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

" + # Loofah.html5_fragment(double_breakpoint_markup).scrub!(:double_breakpoint) + # => "

Some text here in a logical paragraph.

Some more text, apparently a second paragraph.

" + # + class DoubleBreakpoint < Scrubber + def initialize # rubocop:disable Lint/MissingSuper + @direction = :top_down + end + + def scrub(node) + + end + end # # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune). # @@ -364,6 +382,7 @@ def scrub(node) targetblank: TargetBlank, newline_block_elements: NewlineBlockElements, unprintable: Unprintable, + double_breakpoint: DoubleBreakpoint, } class << self From 9da4c19611f4eb96326cedbdc0300ea1b705c3d7 Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 14:55:40 -0400 Subject: [PATCH 3/5] feat: Add implementation for method --- lib/loofah/scrubbers.rb | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/lib/loofah/scrubbers.rb b/lib/loofah/scrubbers.rb index db84131..eb68546 100644 --- a/lib/loofah/scrubbers.rb +++ b/lib/loofah/scrubbers.rb @@ -365,7 +365,40 @@ def initialize # rubocop:disable Lint/MissingSuper end def scrub(node) - + return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p") + + paragraph_with_break_point_nodes = node.xpath("//p[br[following-sibling::br]]") + + paragraph_with_break_point_nodes.each do |paragraph_node| + new_paragraph = paragraph_node.add_previous_sibling("

").first + + paragraph_node.children.each do |child| + remove_blank_text_nodes(child) + end + + paragraph_node.children.each do |child| + # already unlinked + next if child.parent.nil? + + if child.name == "br" && child.next_sibling.name == "br" + new_paragraph = paragraph_node.add_previous_sibling("

").first + child.next_sibling.unlink + child.unlink + else + child.parent = new_paragraph + end + end + + paragraph_node.unlink + end + + CONTINUE + end + + private + + def remove_blank_text_nodes(node) + node.unlink if node.text? && node.blank? end end # From 4b339a5580e745bc33b51f41f6963068960fa51d Mon Sep 17 00:00:00 2001 From: Jose Colella Date: Wed, 8 May 2024 14:59:56 -0400 Subject: [PATCH 4/5] docs: Update documentation --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 11b76c9..69fb7b9 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Active Record extensions for HTML sanitization are available in the [`loofah-act * Add the _nofollow_ attribute to all hyperlinks. * Add the _target=\_blank_ attribute to all hyperlinks. * Remove _unprintable_ characters from text nodes. + * Convert _double breakpoints_ (two consecutive line breaks) into paragraph nodes. * Format markup as plain text, with (or without) sensible whitespace handling around block elements. * Replace Rails's `strip_tags` and `sanitize` view helper methods. @@ -235,6 +236,7 @@ doc.scrub!(:noopener) # adds rel="noopener" attribute to links doc.scrub!(:noreferrer) # adds rel="noreferrer" attribute to links doc.scrub!(:unprintable) # removes unprintable characters from text nodes doc.scrub!(:targetblank) # adds target="_blank" attribute to links +doc.scrub!(:double_breakpoint) # converts double breakpoints (consecutive line breaks) into paragraph nodes ``` See `Loofah::Scrubbers` for more details and example usage. From 4d94183d64b947d3dccbaea0c6074b6402e857a6 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 31 Dec 2024 17:08:48 -0500 Subject: [PATCH 5/5] Drop newlines from the double-breakpoint test because the html4 and html5 parsers just handle tags and insert newlines differently, and their presence/absence is orthogonal to wrapping in a `p` tag. 
--- test/integration/test_scrubbers.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/test_scrubbers.rb b/test/integration/test_scrubbers.rb index 6e8e31d..37b8d81 100644 --- a/test/integration/test_scrubbers.rb +++ b/test/integration/test_scrubbers.rb @@ -245,7 +245,7 @@ def html5? doc = klass.parse("#{BREAKPOINT_FRAGMENT}") result = doc.scrub!(:double_breakpoint) - assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html + assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html.delete("\n") assert_equal doc, result end end