From c8faf56c9216afd49b02841dd1dc9d9935b1027a Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Thu, 7 Oct 2021 20:26:00 +0100 Subject: [PATCH 1/7] Fix bug in strip html whereby leading spaces were stripped out --- packages/dom/src/dom/strip-html.js | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/packages/dom/src/dom/strip-html.js b/packages/dom/src/dom/strip-html.js index efd5e2d8784cc..aab6601abd5d6 100644 --- a/packages/dom/src/dom/strip-html.js +++ b/packages/dom/src/dom/strip-html.js @@ -6,9 +6,22 @@ * @return {string} The text content with any html removed. */ export default function stripHTML( html ) { + const leadingSpaces = html.match( /(^\s+)/g ); + + // DOM Parser will ignore any space character coming after + // the DocType. + // see: https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state + // As a result any leading space in the provided `html` + // argument string will be stripped out. + // Manually retrieve these prior to parsing for restoration post-parse. + const spacesToRestore = leadingSpaces ? leadingSpaces[ 0 ] : ''; + const document = new window.DOMParser().parseFromString( html, 'text/html' ); - return document.body.textContent || ''; + + const content = document.body.textContent || ''; + + return spacesToRestore + content; } From ef4961f48b5ff63ad532627e65979f9068a41a49 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 11:38:00 +0100 Subject: [PATCH 2/7] At basic unit test coverage --- packages/dom/src/dom/test/strip-html.js | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 packages/dom/src/dom/test/strip-html.js diff --git a/packages/dom/src/dom/test/strip-html.js b/packages/dom/src/dom/test/strip-html.js new file mode 100644 index 0000000000000..70acc479ddbfc --- /dev/null +++ b/packages/dom/src/dom/test/strip-html.js @@ -0,0 +1,27 @@ +/** + * Internal dependencies + */ +import stripHTML from '../strip-html'; + +describe( 'stripHTML', () => { + it( 'should strip HTML attributes', () => { + const input = + 'Here is some text that contains HTML markup.'; + const output = 'Here is some text that contains HTML markup.'; + expect( stripHTML( input ) ).toBe( output ); + } ); + + it( 'should preserve leading spaces', () => { + const input = + ' Here is some text with leading spaces.'; + const output = ' Here is some text with leading spaces.'; + expect( stripHTML( input ) ).toBe( output ); + } ); + + it( 'should preserve leading spaces with HTML', () => { + const input = + ' Here is some text with leading spaces.'; + const output = ' Here is some text with leading spaces.'; + expect( stripHTML( input ) ).toBe( output ); + } ); +} ); From 9625b348bde4638631fb26fd01461c7929df911f Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 11:52:55 +0100 Subject: [PATCH 3/7] Add test for invalid HTML markup --- packages/dom/src/dom/test/strip-html.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/dom/src/dom/test/strip-html.js b/packages/dom/src/dom/test/strip-html.js index 70acc479ddbfc..6d4afaa3b4acf 100644 --- a/packages/dom/src/dom/test/strip-html.js +++ b/packages/dom/src/dom/test/strip-html.js @@ -4,13 +4,20 @@ import stripHTML from '../strip-html'; describe( 'stripHTML', () => { - it( 'should strip HTML attributes', () => { + it( 'should strip basic HTML', () => { const input = 'Here is some text that contains HTML markup.'; const output = 'Here is some text that contains HTML markup.'; expect( stripHTML( input ) ).toBe( output ); } ); + it( 'should strip invalid HTML', () => { + const input = + 'Here is some text

that contains HTML markup

.'; + const output = 'Here is some text that contains HTML markup.'; + expect( stripHTML( input ) ).toBe( output ); + } ); + it( 'should preserve leading spaces', () => { const input = ' Here is some text with leading spaces.'; From 0cb643d69fd4be3de5c92ab7a5a8aec71c1c656a Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 14:21:04 +0100 Subject: [PATCH 4/7] Simplify code and improve regex --- packages/dom/src/dom/strip-html.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/dom/src/dom/strip-html.js b/packages/dom/src/dom/strip-html.js index aab6601abd5d6..1e578e3f00c08 100644 --- a/packages/dom/src/dom/strip-html.js +++ b/packages/dom/src/dom/strip-html.js @@ -6,15 +6,14 @@ * @return {string} The text content with any html removed. */ export default function stripHTML( html ) { - const leadingSpaces = html.match( /(^\s+)/g ); - // DOM Parser will ignore any space character coming after // the DocType. // see: https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state // As a result any leading space in the provided `html` // argument string will be stripped out. // Manually retrieve these prior to parsing for restoration post-parse. - const spacesToRestore = leadingSpaces ? leadingSpaces[ 0 ] : ''; + // @ts-ignore + const [ spacesToRestore ] = html.match( /^\s*/ ); const document = new window.DOMParser().parseFromString( html, From b17eac37eae3691b1d2429e1e8516839401d688a Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 14:21:18 +0100 Subject: [PATCH 5/7] Add test for multi-line HTML --- packages/dom/src/dom/test/strip-html.js | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/dom/src/dom/test/strip-html.js b/packages/dom/src/dom/test/strip-html.js index 6d4afaa3b4acf..c820dc84c06f8 100644 --- a/packages/dom/src/dom/test/strip-html.js +++ b/packages/dom/src/dom/test/strip-html.js @@ -4,7 +4,7 @@ import stripHTML from '../strip-html'; describe( 'stripHTML', () => { - it( 'should strip basic HTML', () => { + it( 'should strip valid HTML', () => { const input = 'Here is some text that contains HTML markup.'; const output = 'Here is some text that contains HTML markup.'; @@ -31,4 +31,19 @@ describe( 'stripHTML', () => { const output = ' Here is some text with leading spaces.'; expect( stripHTML( input ) ).toBe( output ); } ); + + it( 'should preserve new lines in multi-line HTML string', () => { + const input = `
+ Here is some + text + with new lines +
`; + + const output = ` + Here is some + text + with new lines + `; + expect( stripHTML( input ) ).toBe( output ); + } ); } ); From 1f808406238f25b608ba9cbe98bf5b364b0cfa40 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 14:24:14 +0100 Subject: [PATCH 6/7] Add test for consequtive spaces within string --- packages/dom/src/dom/test/strip-html.js | 42 +++++++++++++++---------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/packages/dom/src/dom/test/strip-html.js b/packages/dom/src/dom/test/strip-html.js index c820dc84c06f8..f94b433357ea6 100644 --- a/packages/dom/src/dom/test/strip-html.js +++ b/packages/dom/src/dom/test/strip-html.js @@ -18,32 +18,42 @@ describe( 'stripHTML', () => { expect( stripHTML( input ) ).toBe( output ); } ); - it( 'should preserve leading spaces', () => { - const input = - ' Here is some text with leading spaces.'; - const output = ' Here is some text with leading spaces.'; - expect( stripHTML( input ) ).toBe( output ); - } ); + describe( 'whitespace preservation', () => { + it( 'should preserve leading spaces', () => { + const input = + ' Here is some text with leading spaces.'; + const output = ' Here is some text with leading spaces.'; + expect( stripHTML( input ) ).toBe( output ); + } ); - it( 'should preserve leading spaces with HTML', () => { - const input = - ' Here is some text with leading spaces.'; - const output = ' Here is some text with leading spaces.'; - expect( stripHTML( input ) ).toBe( output ); - } ); + it( 'should preserve leading spaces with HTML', () => { + const input = + ' Here is some text with leading spaces.'; + const output = ' Here is some text with leading spaces.'; + expect( stripHTML( input ) ).toBe( output ); + } ); - it( 'should preserve new lines in multi-line HTML string', () => { - const input = `
+ it( 'should preserve consequtive spaces within string', () => { + const input = + 'Here is some text with a lot of spaces inside.'; + const output = + 'Here is some text with a lot of spaces inside.'; + expect( stripHTML( input ) ).toBe( output ); + } ); + + it( 'should preserve new lines in multi-line HTML string', () => { + const input = `
Here is some text with new lines
`; - const output = ` + const output = ` Here is some text with new lines `; - expect( stripHTML( input ) ).toBe( output ); + expect( stripHTML( input ) ).toBe( output ); + } ); } ); } ); From 72111ae22691f9bf1c98f2ad47387d5e49e82baf Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Fri, 8 Oct 2021 14:25:04 +0100 Subject: [PATCH 7/7] Add test for trailing spaces --- packages/dom/src/dom/test/strip-html.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/packages/dom/src/dom/test/strip-html.js b/packages/dom/src/dom/test/strip-html.js index f94b433357ea6..429e68cc9b472 100644 --- a/packages/dom/src/dom/test/strip-html.js +++ b/packages/dom/src/dom/test/strip-html.js @@ -33,6 +33,13 @@ describe( 'stripHTML', () => { expect( stripHTML( input ) ).toBe( output ); } ); + it( 'should preserve trailing spaces with HTML', () => { + const input = + 'Here is some text with trailing spaces. '; + const output = 'Here is some text with trailing spaces. '; + expect( stripHTML( input ) ).toBe( output ); + } ); + it( 'should preserve consequtive spaces within string', () => { const input = 'Here is some text with a lot of spaces inside.';