From d175bb01314efdcbad5c3cb31b02e298e26c6e19 Mon Sep 17 00:00:00 2001
From: Caitlin Potter
Date: Sat, 5 Jul 2014 01:03:52 -0400
Subject: [PATCH] fix(ngSanitize): follow HTML parser rules for start tags /
allow < in text content
ngSanitize will now permit opening braces in text content, provided they are not followed by either
an unescaped backslash, or by an ASCII letter (u+0041 - u+005A, u+0061 - u+007A), in compliance with
rules of the parsing spec, without taking insertion mode into account.
BREAKING CHANGE
Previously, $sanitize would "fix" invalid markup in which a space preceded alphanumeric characters
in a start-tag. Following this change, any opening angle bracket which is not followed by either a
forward slash, or by an ASCII letter (a-z | A-Z) will not be considered a start tag delimiter, per
the HTML parsing spec (http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html).
Closes #8212
Closes #8193
---
src/ngSanitize/sanitize.js | 22 ++++++++++-----
test/ngSanitize/sanitizeSpec.js | 49 +++++++++++++++++++++++++++------
2 files changed, 56 insertions(+), 15 deletions(-)
diff --git a/src/ngSanitize/sanitize.js b/src/ngSanitize/sanitize.js
index fae50aac259d..de6dd3dd135d 100644
--- a/src/ngSanitize/sanitize.js
+++ b/src/ngSanitize/sanitize.js
@@ -154,11 +154,11 @@ function sanitizeText(chars) {
// Regular Expressions for parsing tags and attributes
var START_TAG_REGEXP =
- /^<\s*([\w:-]+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,
- END_TAG_REGEXP = /^<\s*\/\s*([\w:-]+)[^>]*>/,
+ /^<((?:[a-zA-Z])[\w:-]*)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*(>?)/,
+ END_TAG_REGEXP = /^<\/\s*([\w:-]+)[^>]*>/,
ATTR_REGEXP = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:[^"])*)")|(?:'((?:[^'])*)')|([^>\s]+)))?/g,
BEGIN_TAG_REGEXP = /^,
- BEGING_END_TAGE_REGEXP = /^<\s*\//,
+ BEGING_END_TAGE_REGEXP = /^<\//,
COMMENT_REGEXP = //g,
DOCTYPE_REGEXP = /]*?)>/i,
CDATA_REGEXP = //g,
@@ -232,10 +232,11 @@ function makeMap(str) {
* @param {object} handler
*/
function htmlParser( html, handler ) {
- var index, chars, match, stack = [], last = html;
+ var index, chars, match, stack = [], last = html, text;
stack.last = function() { return stack[ stack.length - 1 ]; };
while ( html ) {
+ text = '';
chars = true;
// Make sure we're not in a script or style element
@@ -274,16 +275,23 @@ function htmlParser( html, handler ) {
match = html.match( START_TAG_REGEXP );
if ( match ) {
- html = html.substring( match[0].length );
- match[0].replace( START_TAG_REGEXP, parseStartTag );
+ // We only have a valid start-tag if there is a '>'.
+ if ( match[4] ) {
+ html = html.substring( match[0].length );
+ match[0].replace( START_TAG_REGEXP, parseStartTag );
+ }
chars = false;
+ } else {
+ // no ending tag found --- this piece should be encoded as an entity.
+ text += '<';
+ html = html.substring(1);
}
}
if ( chars ) {
index = html.indexOf("<");
- var text = index < 0 ? html : html.substring( 0, index );
+ text += index < 0 ? html : html.substring( 0, index );
html = index < 0 ? "" : html.substring( index );
if (handler.chars) handler.chars( decodeEntities(text) );
diff --git a/test/ngSanitize/sanitizeSpec.js b/test/ngSanitize/sanitizeSpec.js
index 76a48087576e..5f2594397657 100644
--- a/test/ngSanitize/sanitizeSpec.js
+++ b/test/ngSanitize/sanitizeSpec.js
@@ -21,6 +21,7 @@ describe('HTML', function() {
var handler, start, text, comment;
beforeEach(function() {
+ text = "";
handler = {
start: function(tag, attrs, unary){
start = {
@@ -35,7 +36,7 @@ describe('HTML', function() {
});
},
chars: function(text_){
- text = text_;
+ text += text_;
},
end:function(tag) {
expect(tag).toEqual(start.tag);
@@ -81,8 +82,31 @@ describe('HTML', function() {
expect(text).toEqual('text');
});
+ it('should not treat "<" followed by a non-/ or non-letter as a tag', function() {
+ expectHTML('<- text1 text2 <1 text1 text2 <{', handler).
+ toBe('<- text1 text2 <1 text1 text2 <{');
+ });
+
+ it('should throw badparse if text content contains "<" followed by "/" without matching ">"', function() {
+ expect(function() {
+ htmlParser('foo bar', handler);
+ }).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: bar');
+ });
+
+ it('should throw badparse if text content contains "<" followed by an ASCII letter without matching ">"', function() {
+ expect(function() {
+ htmlParser('foo 10 < 100
', handler);
+ expect(text).toEqual(' 10 < 100 ');
+ });
+
it('should parse newlines in tags', function() {
- htmlParser('<\ntag\n attr="value"\n>text<\n/\ntag\n>', handler);
+ htmlParser('text\ntag\n>', handler);
expect(start).toEqual({tag:'tag', attrs:{attr:'value'}, unary:false});
expect(text).toEqual('text');
});
@@ -123,8 +147,9 @@ describe('HTML', function() {
expectHTML('ac.').toEqual('ac.');
});
- it('should remove nested script', function() {
- expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').toEqual('ac.');
+ it('should escape non-start tags', function() {
+ expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').
+ toBe('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.');
});
it('should remove attrs', function() {
@@ -165,14 +190,16 @@ describe('HTML', function() {
expectHTML(everything).toEqual(everything);
});
- it('should handle improper html', function() {
+ it('should mangle improper html', function() {
+ // This text is encoded more than a real HTML parser would, but it should render the same.
expectHTML('< div rel="" alt=abc dir=\'"\' >text< /div>').
- toEqual('text
');
+ toBe('< div rel="" alt=abc dir=\'"\' >text< /div>');
});
- it('should handle improper html2', function() {
+ it('should mangle improper html2', function() {
+ // A proper HTML parser would clobber this more in most cases, but it looks reasonable.
expectHTML('< div rel="" / >').
- toEqual('');
+ toBe('< div rel="" / >');
});
it('should ignore back slash as escape', function() {
@@ -195,6 +222,12 @@ describe('HTML', function() {
expectHTML('\na\n').toEqual('
a
');
});
+ it('should accept tag delimiters such as "<" inside real tags (with nesting)', function() {
+ //this is an integrated version of the 'should accept tag delimiters such as "<" inside real tags' test
+ expectHTML(' 10 < 100
')
+ .toEqual(' 10 < 100
');
+ });
+
describe('htmlSanitizerWriter', function() {
/* global htmlSanitizeWriter: false */
if (angular.isUndefined(window.htmlSanitizeWriter)) return;