Skip to content

Commit

Permalink
fix: Prevent code injection due to improper escaping in libxml2 >= 2.9.2
Browse files Browse the repository at this point in the history
When Sanitize <= 4.6.2 is used in combination with libxml2 >= 2.9.2, a
specially crafted HTML fragment can cause libxml2 to generate improperly
escaped output, allowing non-whitelisted attributes to be used on
whitelisted elements.

Sanitize now performs additional escaping on affected attributes to
prevent this.

Many thanks to the Shopify Application Security Team for responsibly
reporting this issue.

Fixes #176
  • Loading branch information
rgrove committed Mar 20, 2018
1 parent 0eee92e commit 01629a1
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 20 deletions.
93 changes: 74 additions & 19 deletions lib/sanitize/transformers/clean_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,31 @@ class Sanitize; module Transformers; class CleanElement
# http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u

# Attributes that need additional escaping on `<a>` elements due to unsafe
# libxml2 behavior.
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
name
])

# Attributes that need additional escaping on all elements due to unsafe
# libxml2 behavior.
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
action
href
src
])

# Mapping of original characters to escape sequences for characters that
# should be escaped in attributes affected by unsafe libxml2 behavior.
UNSAFE_LIBXML_ESCAPE_CHARS = {
' ' => '%20',
'"' => '%22'
}

# Regex that matches any single character that needs to be escaped in
# attributes affected by unsafe libxml2 behavior.
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/

def initialize(config)
@add_attributes = config[:add_attributes]
@attributes = config[:attributes].dup
Expand Down Expand Up @@ -92,31 +117,61 @@ def call(env)
node.attribute_nodes.each do |attr|
attr_name = attr.name.downcase

if attr_whitelist.include?(attr_name)
# The attribute is whitelisted.
unless attr_whitelist.include?(attr_name)
# The attribute isn't whitelisted.

if allow_data_attributes && attr_name.start_with?('data-')
# Arbitrary data attributes are allowed. If this is a data
# attribute, continue.
next if attr_name =~ REGEX_DATA_ATTR
end

# Either the attribute isn't a data attribute or arbitrary data
# attributes aren't allowed. Remove the attribute.
attr.unlink
next
end

# The attribute is whitelisted.

# Remove any attributes that use unacceptable protocols.
if @protocols.include?(name) && @protocols[name].include?(attr_name)
attr_protocols = @protocols[name][attr_name]
# Remove any attributes that use unacceptable protocols.
if @protocols.include?(name) && @protocols[name].include?(attr_name)
attr_protocols = @protocols[name][attr_name]

if attr.value =~ REGEX_PROTOCOL
attr.unlink unless attr_protocols.include?($1.downcase)
else
attr.unlink unless attr_protocols.include?(:relative)
if attr.value =~ REGEX_PROTOCOL
unless attr_protocols.include?($1.downcase)
attr.unlink
next
end
end
else
# The attribute isn't whitelisted.

if allow_data_attributes && attr_name.start_with?('data-')
# Arbitrary data attributes are allowed. Verify that the attribute
# is a valid data attribute.
attr.unlink unless attr_name =~ REGEX_DATA_ATTR
else
# Either the attribute isn't a data attribute, or arbitrary data
# attributes aren't allowed. Remove the attribute.
attr.unlink
unless attr_protocols.include?(:relative)
attr.unlink
next
end
end

# Leading and trailing whitespace around URLs is ignored at parse
# time. Stripping it here prevents it from being escaped by the
# libxml2 workaround below.
attr.value = attr.value.strip
end

# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
# attempt to preserve server-side includes. This can result in XSS since
# an unescaped double quote can allow an attacker to inject a
# non-whitelisted attribute.
#
# Sanitize works around this by implementing its own escaping for
# affected attributes, some of which can exist on any element and some
# of which can only exist on `<a>` elements.
#
# The relevant libxml2 code is here:
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))

attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
end
end
end
Expand Down
12 changes: 11 additions & 1 deletion test/test_clean_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@

it 'should not choke on valueless attributes' do
@s.fragment('foo <a href>foo</a> bar')
.must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
.must_equal 'foo <a href rel="nofollow">foo</a> bar'
end

it 'should downcase attribute names' do
Expand Down Expand Up @@ -300,6 +300,16 @@
}).must_equal input
end

it "should not allow relative URLs when relative URLs aren't whitelisted" do
input = '<a href="/foo/bar">Link</a>'

Sanitize.fragment(input,
:elements => ['a'],
:attributes => {'a' => ['href']},
:protocols => {'a' => {'href' => ['http']}}
).must_equal '<a>Link</a>'
end

it 'should allow relative URLs containing colons when the colon is not in the first path segment' do
input = '<a href="/wiki/Special:Random">Random Page</a>'

Expand Down
64 changes: 64 additions & 0 deletions test/test_malicious_html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,4 +125,68 @@
must_equal '&lt;alert("XSS");//&lt;'
end
end

# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
# attempt to preserve server-side includes. This can result in XSS since an
# unescaped double quote can allow an attacker to inject a non-whitelisted
# attribute. Sanitize works around this by implementing its own escaping for
# affected attributes.
#
# The relevant libxml2 code is here:
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
describe 'unsafe libxml2 server-side includes in attributes' do
tag_configs = [
{
tag_name: 'a',
escaped_attrs: %w[ action href src name ],
unescaped_attrs: []
},

{
tag_name: 'div',
escaped_attrs: %w[ action href src ],
unescaped_attrs: %w[ name ]
}
]

before do
@s = Sanitize.new({
elements: %w[ a div ],

attributes: {
all: %w[ action href src name ]
}
})
end

tag_configs.each do |tag_config|
tag_name = tag_config[:tag_name]

tag_config[:escaped_attrs].each do |attr_name|
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]

it 'should escape unsafe characters in attributes' do
@s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
end

it 'should round-trip to the same output' do
output = @s.fragment(input)
@s.fragment(output).must_equal(output)
end
end

tag_config[:unescaped_attrs].each do |attr_name|
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]

it 'should not escape characters unnecessarily' do
@s.fragment(input).must_equal(input)
end

it 'should round-trip to the same output' do
output = @s.fragment(input)
@s.fragment(output).must_equal(output)
end
end
end
end
end

0 comments on commit 01629a1

Please sign in to comment.