Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stripHTML - preserve leading and trailing spaces and strip script and on* attributes #35539

Merged
merged 6 commits into from
Oct 22, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions packages/dom/src/dom/strip-html.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/**
* Internal dependencies
*/
import safeHTML from './safe-html';

/**
* Removes any HTML tags from the provided string.
*
Expand All @@ -6,9 +11,11 @@
* @return {string} The text content with any html removed.
*/
export default function stripHTML( html ) {
	// Sanitize first: stripping the tags alone would leave the *contents* of
	// script elements in the resulting text, so script tags and on*
	// attributes are removed up front by safeHTML().
	const sanitized = safeHTML( html );

	// Parse inside the body of a separate, freshly created HTML document and
	// read the text back out. Assigning through `innerHTML` (instead of the
	// earlier DOMParser-based approach) is what allows leading and trailing
	// whitespace of the input to be kept in the result.
	const doc = document.implementation.createHTMLDocument( '' );
	doc.body.innerHTML = sanitized;

	// `textContent` may be null in the general DOM API; always hand callers a
	// string.
	return doc.body.textContent || '';
}
64 changes: 64 additions & 0 deletions packages/dom/src/dom/test/strip-html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/**
* Internal dependencies
*/
import stripHTML from '../strip-html';

// Test suite for stripHTML(): tag/script/on* attribute removal and exact
// whitespace preservation of the remaining text.
describe( 'stripHTML', () => {
	it( 'should strip valid HTML, scripts and on attributes', () => {
		const input = `<strong onClick="alert('and on attributes')">Here is some text</strong> that contains <em>HTML markup</em><script>alert("and scripts")</script>.`;
		const output = 'Here is some text that contains HTML markup.';
		expect( stripHTML( input ) ).toBe( output );
	} );

	it( 'should strip invalid HTML, scripts and on attributes', () => {
		// Deliberately mismatched closing tags (</em>, </div>) to exercise
		// the parser's error recovery.
		const input = `<strong onClick="alert('and on attributes')">Here is some text</em> <p></div>that contains HTML markup</p><script>alert("and scripts")</script>.`;
		const output = 'Here is some text that contains HTML markup.';
		expect( stripHTML( input ) ).toBe( output );
	} );

	describe( 'whitespace preservation', () => {
		it( 'should preserve leading spaces', () => {
			const input =
				' <strong>Here is some text</strong> with <em>leading spaces</em>.';
			const output = ' Here is some text with leading spaces.';
			expect( stripHTML( input ) ).toBe( output );
		} );

		it( 'should preserve leading spaces with HTML', () => {
			const input =
				'<strong> Here is some text</strong> with <em>leading spaces</em>.';
			const output = ' Here is some text with leading spaces.';
			expect( stripHTML( input ) ).toBe( output );
		} );

		it( 'should preserve trailing spaces with HTML', () => {
			const input =
				'<strong>Here is some text</strong> with <em>trailing spaces</em>. ';
			const output = 'Here is some text with trailing spaces. ';
			expect( stripHTML( input ) ).toBe( output );
		} );

		it( 'should preserve consecutive spaces within string', () => {
			// The input must contain real runs of spaces (inside and between
			// elements); otherwise this case would pass even if whitespace
			// were being collapsed.
			const input =
				'<strong>Here is some   text</strong>  with <em>a lot of   spaces inside</em>.';
			const output =
				'Here is some   text  with a lot of   spaces inside.';
			expect( stripHTML( input ) ).toBe( output );
		} );

		it( 'should preserve new lines in multi-line HTML string', () => {
			// Built with join() so the fixture does not depend on how this
			// file itself is indented.
			const input = [
				'<div>',
				'Here is some',
				'<em>text</em>',
				'with new lines',
				'</div>',
			].join( '\n' );

			// Only the tags disappear; every line break stays, including the
			// ones right after <div> and right before </div>.
			const output = [
				'',
				'Here is some',
				'text',
				'with new lines',
				'',
			].join( '\n' );
			expect( stripHTML( input ) ).toBe( output );
		} );
	} );
} );