Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[draft] default to html5 parsing #239

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,29 @@ jobs:
bundler-cache: true
- run: bundle exec rake rubocop

test:
test_html4:
env:
LOOFAH_HTML4_MODE: "t"
needs: ["rubocop"]
strategy:
fail-fast: false
matrix:
ruby: ["2.5", "2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
ruby: ["2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{matrix.ruby}}
bundler-cache: true
- run: bundle exec rake test

test_html5:
needs: ["rubocop"]
strategy:
fail-fast: false
matrix:
ruby: ["2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand Down
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
source "https://rubygems.org/"
gemspec
gem "nokogiri", git: "https://github.com/sparklemotion/nokogiri" # main has subclassing fix
56 changes: 38 additions & 18 deletions lib/loofah.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,6 @@
require "nokogiri"

require_relative "loofah/version"
require_relative "loofah/metahelpers"
require_relative "loofah/elements"

require_relative "loofah/html5/safelist"
require_relative "loofah/html5/libxml2_workarounds"
require_relative "loofah/html5/scrub"

require_relative "loofah/scrubber"
require_relative "loofah/scrubbers"

require_relative "loofah/instance_methods"
require_relative "loofah/xml/document"
require_relative "loofah/xml/document_fragment"
require_relative "loofah/html/document"
require_relative "loofah/html/document_fragment"

# == Strings and IO Objects as Input
#
Expand All @@ -31,13 +16,13 @@
module Loofah
class << self
# Shortcut for Loofah::HTML::Document.parse
# This method accepts the same parameters as Nokogiri::HTML::Document.parse
# This method accepts the same parameters as Nokogiri::HTML5::Document.parse
def document(*args, &block)
remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
remove_comments_before_html_element(Loofah::HTML::Document.parse(*args, &block))
end

# Shortcut for Loofah::HTML::DocumentFragment.parse
# This method accepts the same parameters as Nokogiri::HTML::DocumentFragment.parse
# This method accepts the same parameters as Nokogiri::HTML5::DocumentFragment.parse
def fragment(*args, &block)
Loofah::HTML::DocumentFragment.parse(*args, &block)
end
Expand Down Expand Up @@ -79,6 +64,25 @@ def remove_extraneous_whitespace(string)
string.gsub(/\n\s*\n\s*\n/, "\n\n")
end

def html5_mode?
defined?(::Nokogiri::HTML5) && (parser_module == ::Nokogiri::HTML5)
end

def parser_module(class_name=nil)
@parser_module ||= begin
if ENV["LOOFAH_HTML4_MODE"].nil? && defined?(::Nokogiri::HTML5)
::Nokogiri::HTML5
else
::Nokogiri::HTML4
end
end
if class_name
@parser_module.const_get(class_name)
else
@parser_module
end
end

private

# remove comments that exist outside of the HTML element.
Expand All @@ -98,3 +102,19 @@ def remove_comments_before_html_element(doc)
end
end
end

require_relative "loofah/metahelpers"
require_relative "loofah/elements"

require_relative "loofah/html5/safelist"
require_relative "loofah/html5/libxml2_workarounds"
require_relative "loofah/html5/scrub"

require_relative "loofah/scrubber"
require_relative "loofah/scrubbers"

require_relative "loofah/instance_methods"
require_relative "loofah/xml/document"
require_relative "loofah/xml/document_fragment"
require_relative "loofah/html/document"
require_relative "loofah/html/document_fragment"
4 changes: 2 additions & 2 deletions lib/loofah/html/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
module Loofah
module HTML # :nodoc:
#
# Subclass of Nokogiri::HTML::Document.
# Subclass of Nokogiri::HTML5::Document.
#
# See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
#
class Document < Nokogiri::HTML::Document
class Document < ::Loofah.parser_module(:Document)
include Loofah::ScrubBehavior::Node
include Loofah::DocumentDecorator
include Loofah::TextBehavior
Expand Down
6 changes: 3 additions & 3 deletions lib/loofah/html/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
module Loofah
module HTML # :nodoc:
#
# Subclass of Nokogiri::HTML::DocumentFragment.
# Subclass of Nokogiri::HTML5::DocumentFragment.
#
# See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
#
class DocumentFragment < Nokogiri::HTML::DocumentFragment
class DocumentFragment < ::Loofah.parser_module(:DocumentFragment)
include Loofah::TextBehavior

class << self
#
# Overridden Nokogiri::HTML::DocumentFragment
# Overridden Nokogiri::HTML5::DocumentFragment
# constructor. Applications should use Loofah.fragment to
# parse a fragment.
#
Expand Down
3 changes: 3 additions & 0 deletions lib/loofah/html5/safelist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,9 @@ module SafeList
"img",
"input",
])
if ::Loofah.html5_mode?
VOID_ELEMENTS.add("wbr")
end

# additional tags we should consider safe since we have libxml2 fixing up our documents.
TAGS_SAFE_WITH_LIBXML2 = Set.new([
Expand Down
2 changes: 1 addition & 1 deletion lib/loofah/scrubber.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ def append_attribute(node, attribute, value)
def html5lib_sanitize(node)
case node.type
when Nokogiri::XML::Node::ELEMENT_NODE
HTML5::Scrub.scrub_attributes node
if HTML5::Scrub.allowed_element? node.name
HTML5::Scrub.scrub_attributes node
return Scrubber::CONTINUE
end
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
Expand Down
82 changes: 52 additions & 30 deletions test/assets/testdata_sanitizer_tests1.dat
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
"xhtml": "&lt;![if !IE 5]&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;![endif]&gt;",
"commentary": "output is libxml 2.9.13 and earlier, xhtml is libxml 2.9.14 and later, see libxml 148be64"
"libxml_lte_2.9.13": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
"libxml_gte_2.9.14": "&lt;![if !IE 5]&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;![endif]&gt;",
"libgumbo": "&lt;!--[if !IE 5]--&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;!--[endif]--&gt;"
},

{
Expand All @@ -30,8 +30,8 @@
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"/&gt;",
"rexml": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
"libxml": "&lt;bgsound&gt;&lt;/bgsound&gt;",
"libgumbo": "&lt;bgsound&gt;"
},

{
Expand Down Expand Up @@ -85,15 +85,27 @@
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": "<img src='http://ha.ckers.org/scriptlet.html'>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img src='http://ha.ckers.org/scriptlet.html'>",
"libgumbo": "" /* it is indeed the empty result, see next test for a better test */
},

{
"name": "double_open_angle_brackets v2",
"input": "<div><img src=http://ha.ckers.org/scriptlet.html < </div>",
"output": "<div><img src='http://ha.ckers.org/scriptlet.html'></div>"
},

{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;",
"libgumbo": "" /* it is indeed empty */
},

{
"name": "double_open_angle_brackets_2 v2",
"input": "<div><script src=http://ha.ckers.org/scriptlet.html < </div>",
"libxml": "<div>&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;</div>"
},

{
Expand Down Expand Up @@ -127,15 +139,13 @@
{
"name": "link_stylesheets",
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
"output": "&lt;link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"&gt;",
"rexml": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/&gt;"
"output": "&lt;link rel=\"stylesheet\"&gt;"
},

{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"&gt;",
"rexml": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/&gt;"
"output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"&gt;"
},

{
Expand All @@ -147,8 +157,8 @@
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"libgumbo": "&lt;script src='http://ha.ckers.org/xss.js?&lt;b'&gt;&lt;/script&gt;"
},

{
Expand All @@ -168,8 +178,8 @@
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img>",
"libgumbo": "<img src='http://ha.ckers.org/xss.js'>" /* see "should_allow_image_src_attribute" test */
},

{
Expand Down Expand Up @@ -367,8 +377,14 @@
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": "<img>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img>",
"libgumbo": "" /* indeed it is empty */
},

{
"name": "should_sanitize_half_open_scripts 2",
"input": "<div><img src=\"javascript:alert('XSS')\" </div>",
"output": "<div><img></div>"
},

{
Expand All @@ -387,24 +403,30 @@
},

{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2a",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;",
"libgumbo": "" /* it is indeed empty, see next test */
},

{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2b",
"input": "<div><iframe src=http://ha.ckers.org/scriptlet.html\n< </div>",
"libxml": "<div>&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;</div>"
},

{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr&gt;&lt;/scr&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;scr&gt;&lt;/scr&gt;",
"libgumbo": "&lt;scr�ipt&gt;alert('XSS')&lt;/scr�ipt&gt;"
},

{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"libgumbo": "&lt;script src='http://ha.ckers.org/xss.js?&lt;b'&gt;&lt;/script&gt;"
},

{
Expand Down Expand Up @@ -448,8 +470,8 @@
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"rexml": "<img src='foo' title='\"foo\" bar' />",
"output": "<img src='foo' title='\"foo\" bar'>"
"libxml": "<img src='foo' title='\"foo\" bar'>",
"libgumbo": "<img src='foo' title='&quot;foo&quot; bar'>"
},

{
Expand Down Expand Up @@ -487,8 +509,8 @@
{
"name": "allow_html5_image_tag",
"input": "<image src='foo' />",
"rexml": "&lt;image src=\"foo\"&gt;&lt;/image&gt;",
"output": "&lt;image src=\"foo\"/&gt;"
"libxml": "&lt;image src=\"foo\"&gt;&lt;/image&gt;",
"libgumbo": "<img src='foo'>"
},

{
Expand Down
5 changes: 5 additions & 0 deletions test/helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@
require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah", "helpers"))

puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
puts "=> parser module is #{::Loofah::parser_module}"

class Loofah::TestCase < MiniTest::Spec
class << self
alias_method :context, :describe
end

def html5_mode?
::Loofah.html5_mode?
end
end
Loading