From 04f889995b669abc3a4945a6177cc81a9dbff0fa Mon Sep 17 00:00:00 2001
From: Kuba Niegowski <j.niegowski@cksource.com>
Date: Sun, 16 Jan 2022 18:48:22 +0100
Subject: [PATCH 1/2] HtmlDataProcessor should preserve leading HTML comments,
 <script> and <style> tags if only the content of the document body was provided.

---
 .../src/dataprocessor/htmldataprocessor.js    |  38 +-
 .../tests/dataprocessor/htmldataprocessor.js  | 324 ++++++++++++------
 2 files changed, 221 insertions(+), 141 deletions(-)
diff --git a/packages/ckeditor5-engine/src/dataprocessor/htmldataprocessor.js b/packages/ckeditor5-engine/src/dataprocessor/htmldataprocessor.js
index e88c4c768a0..94566de2c68 100644
--- a/packages/ckeditor5-engine/src/dataprocessor/htmldataprocessor.js
+++ b/packages/ckeditor5-engine/src/dataprocessor/htmldataprocessor.js
@@ -12,8 +12,6 @@
 import BasicHtmlWriter from './basichtmlwriter';
 import DomConverter from '../view/domconverter';
 
-import isComment from '@ckeditor/ckeditor5-utils/src/dom/iscomment';
-
 /**
  * The HTML data processor class.
  * This data processor implementation uses HTML as input and output data.
@@ -116,37 +114,15 @@ export default class HtmlDataProcessor {
 	 * @returns {DocumentFragment}
 	 */
 	_toDom( data ) {
-		const document = this.domParser.parseFromString( data, 'text/html' );
-		const fragment = document.createDocumentFragment();
-
-		// The rules for parsing an HTML string can be read on https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml.
-		//
-		// In short, parsing tokens in an HTML string starts with the so-called "initial" insertion mode. When a DOM parser is in this
-		// state and encounters a comment node, it inserts this comment node as the last child of the newly-created `HTMLDocument` object.
-		// The parser then proceeds to successive insertion modes during parsing subsequent tokens and appends in the `HTMLDocument` object
-		// other nodes (like <html>, <head>, <body>). This causes that the first leading comments from HTML string become the first nodes
-		// in the `HTMLDocument` object, but not in the <body> collection, because they are ultimately located before the <html> element.
-		//
-		// Therefore, so that such leading comments do not disappear, they all are moved from the `HTMLDocument` object to the document
-		// fragment, until the <html> element is encountered.
-		//
-		// See: https://github.com/ckeditor/ckeditor5/issues/9861.
-		let documentChildNode = document.firstChild;
-
-		while ( !documentChildNode.isSameNode( document.documentElement ) ) {
-			const node = documentChildNode;
-
-			documentChildNode = documentChildNode.nextSibling;
-
-			// It seems that `DOMParser#parseFromString()` adds only comment nodes directly to the `HTMLDocument` object, before the <html>
-			// node. The condition below is just to be sure we are moving only comment nodes.
-
-			/* istanbul ignore else */
-			if ( isComment( node ) ) {
-				fragment.appendChild( node );
-			}
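+		// Wrap data with a <body> tag so that leading non-layout nodes (like <script>, <style>, or HTML comments)
+		// are preserved in the body collection (the parser would otherwise place them outside of the <body>).
+		// Do it only for data that does not look like a full HTML document (no <html>, <head>, <body> or <meta> tag).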
+		if ( !data.match( /<(?:html|body|head|meta)(?:\s[^>]*)?>/i ) ) {
+			data = `<body>${ data }</body>`;
 		}
 
+		const document = this.domParser.parseFromString( data, 'text/html' );
+		const fragment = document.createDocumentFragment();
 		const bodyChildNodes = document.body.childNodes;
 
 		while ( bodyChildNodes.length > 0 ) {
diff --git a/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js b/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
index 7a1ed23d6ec..e11b1e6ab69 100644
--- a/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
+++ b/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
@@ -7,7 +7,7 @@
 
 import HtmlDataProcessor from '../../src/dataprocessor/htmldataprocessor';
 import BasicHtmlWriter from '../../src/dataprocessor/basichtmlwriter';
-import DomConverter from '../../src//view/domconverter';
+import DomConverter from '../../src/view/domconverter';
 import xssTemplates from '../../tests/dataprocessor/_utils/xsstemplates';
 import ViewDocumentFragment from '../../src/view/documentfragment';
 import { stringify, parse } from '../../src/dev-utils/view';
@@ -109,150 +109,254 @@ describe( 'HtmlDataProcessor', () => {
 	} );
 
 	describe( '_toDom()', () => {
-		it( 'should insert nested comment nodes into <body> collection', () => {
-			const bodyDocumentFragment = dataProcessor._toDom(
-				'<div>' +
-					'<!-- Comment 1 -->' +
+		describe( 'HTML fragment without document structure', () => {
+			it( 'should insert nested comment nodes into <body> collection', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<div>' +
+						'<!-- Comment 1 -->' +
+						'<p>' +
+							'<!-- Comment 2 -->' +
+							'Paragraph' +
+							'<!-- Comment 3 -->' +
+						'</p>' +
+						'<!-- Comment 4 -->' +
+					'</div>'
+				);
+
+				const [ div ] = bodyDocumentFragment.childNodes;
+				const [ comment1, paragraph, comment4 ] = div.childNodes;
+				const [ comment2, text, comment3 ] = paragraph.childNodes;
+
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 1 );
+				expect( div.childNodes.length ).to.equal( 3 );
+				expect( paragraph.childNodes.length ).to.equal( 3 );
+
+				expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment1.data ).to.equal( ' Comment 1 ' );
+
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
+
+				expect( comment3.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment3.data ).to.equal( ' Comment 3 ' );
+
+				expect( comment4.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment4.data ).to.equal( ' Comment 4 ' );
+
+				expect( text.nodeType ).to.equal( Node.TEXT_NODE );
+				expect( text.data ).to.equal( 'Paragraph' );
+
+				expect( div.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( div.outerHTML ).to.equal(
+					'<div>' +
+						'<!-- Comment 1 -->' +
+						'<p>' +
+							'<!-- Comment 2 -->' +
+							'Paragraph' +
+							'<!-- Comment 3 -->' +
+						'</p>' +
+						'<!-- Comment 4 -->' +
+					'</div>'
+				);
+
+				expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( paragraph.outerHTML ).to.equal(
 					'<p>' +
 						'<!-- Comment 2 -->' +
 						'Paragraph' +
 						'<!-- Comment 3 -->' +
-					'</p>' +
-					'<!-- Comment 4 -->' +
-				'</div>'
-			);
+					'</p>'
+				);
+			} );
 
-			const [ div ] = bodyDocumentFragment.childNodes;
-			const [ comment1, paragraph, comment4 ] = div.childNodes;
-			const [ comment2, text, comment3 ] = paragraph.childNodes;
+			it( 'should insert leading comment nodes from HTML string into <body> collection', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<!-- Comment 1 -->' +
+					'<!-- Comment 2 -->' +
+					'<h2>Heading</h2>' +
+					'<p>Paragraph</p>' +
+					'<!-- Comment 3 -->' +
+					'<!-- Comment 4 -->'
+				);
+
+				const [
+					comment1,
+					comment2,
+					heading,
+					paragraph,
+					comment3,
+					comment4
+				] = bodyDocumentFragment.childNodes;
 
-			expect( bodyDocumentFragment.childNodes.length ).to.equal( 1 );
-			expect( div.childNodes.length ).to.equal( 3 );
-			expect( paragraph.childNodes.length ).to.equal( 3 );
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 6 );
 
-			expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment1.data ).to.equal( ' Comment 1 ' );
+				expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment1.data ).to.equal( ' Comment 1 ' );
 
-			expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment2.data ).to.equal( ' Comment 2 ' );
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
 
-			expect( comment3.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment3.data ).to.equal( ' Comment 3 ' );
+				expect( comment3.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment3.data ).to.equal( ' Comment 3 ' );
 
-			expect( comment4.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment4.data ).to.equal( ' Comment 4 ' );
+				expect( comment4.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment4.data ).to.equal( ' Comment 4 ' );
 
-			expect( text.nodeType ).to.equal( Node.TEXT_NODE );
-			expect( text.data ).to.equal( 'Paragraph' );
+				expect( heading.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( heading.outerHTML ).to.equal( '<h2>Heading</h2>' );
 
-			expect( div.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( div.outerHTML ).to.equal(
-				'<div>' +
+				expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( paragraph.outerHTML ).to.equal( '<p>Paragraph</p>' );
+			} );
+
+			it( 'should insert leading script nodes from HTML string into <body> collection', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<script>foo</script>' + // Leading <script>: would be parsed into the <head> without the <body> wrapper.
 					'<!-- Comment 1 -->' +
+					'<script>bar</script>' + // Still precedes any flow content, so it must be kept as well.
+					'<h2>Heading</h2>' +
+					'<p>Paragraph</p>' +
+					'<!-- Comment 2 -->'
+				);
+
+				const [
+					script1,
+					comment1,
+					script2,
+					heading,
+					paragraph,
+					comment2
+				] = bodyDocumentFragment.childNodes;
+
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 6 ); // Nothing dropped, source order kept.
+
+				expect( script1.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( script1.outerHTML ).to.equal( '<script>foo</script>' );
+
+				expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment1.data ).to.equal( ' Comment 1 ' );
+
+				expect( script2.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( script2.outerHTML ).to.equal( '<script>bar</script>' );
+
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
+
+				expect( heading.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( heading.outerHTML ).to.equal( '<h2>Heading</h2>' );
+
+				expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( paragraph.outerHTML ).to.equal( '<p>Paragraph</p>' );
+			} );
+
+			it( 'should preserve leading non-layout elements', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<!-- Comment 1 -->' +
+					'<style>#foo { color: red }</style>' +
+					'<script>bar</script>' +
 					'<p>' +
 						'<!-- Comment 2 -->' +
 						'Paragraph' +
-						'<!-- Comment 3 -->' +
-					'</p>' +
-					'<!-- Comment 4 -->' +
-				'</div>'
-			);
+					'</p>'
+				);
 
-			expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( paragraph.outerHTML ).to.equal(
-				'<p>' +
-					'<!-- Comment 2 -->' +
-					'Paragraph' +
-					'<!-- Comment 3 -->' +
-				'</p>'
-			);
-		} );
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 4 );
 
-		it( 'should insert leading comment nodes from HTML string into <body> collection #1', () => {
-			const bodyDocumentFragment = dataProcessor._toDom(
-				'<!-- Comment 1 -->' +
-				'<!-- Comment 2 -->' +
-				'<h2>Heading</h2>' +
-				'<p>Paragraph</p>' +
-				'<!-- Comment 3 -->' +
-				'<!-- Comment 4 -->'
-			);
+				const [
+					comment1,
+					style,
+					script,
+					paragraph
+				] = bodyDocumentFragment.childNodes;
+
+				expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment1.data ).to.equal( ' Comment 1 ' );
+
+				expect( style.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( style.outerHTML ).to.equal( '<style>#foo { color: red }</style>' );
 
-			const [
-				comment1,
-				comment2,
-				heading,
-				paragraph,
-				comment3,
-				comment4
-			] = bodyDocumentFragment.childNodes;
+				expect( script.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( script.outerHTML ).to.equal( '<script>bar</script>' );
 
-			expect( bodyDocumentFragment.childNodes.length ).to.equal( 6 );
+				expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
+				expect( paragraph.outerHTML ).to.equal( '<p><!-- Comment 2 -->Paragraph</p>' );
+			} );
+		} );
 
-			expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment1.data ).to.equal( ' Comment 1 ' );
+		describe( 'full HTML document', () => {
+			it( 'should ignore leading non-layout elements if <html> tag is provided', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<html>' +
+						'<!-- Comment 1 -->' +
+						'<style>#foo { color: red }</style>' +
+						'<script>bar</script>' +
+						'<p>' +
+							'<!-- Comment 2 -->' +
+							'Paragraph' +
+						'</p>' +
+					'</html>'
+				);
 
-			expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment2.data ).to.equal( ' Comment 2 ' );
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 1 );
 
-			expect( comment3.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment3.data ).to.equal( ' Comment 3 ' );
+				const [ paragraph ] = bodyDocumentFragment.childNodes;
+				const [ comment2, text ] = paragraph.childNodes;
 
-			expect( comment4.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment4.data ).to.equal( ' Comment 4 ' );
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
 
-			expect( heading.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( heading.outerHTML ).to.equal( '<h2>Heading</h2>' );
+				expect( text.nodeType ).to.equal( Node.TEXT_NODE );
+				expect( text.data ).to.equal( 'Paragraph' );
+			} );
 
-			expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( paragraph.outerHTML ).to.equal( '<p>Paragraph</p>' );
-		} );
+			it( 'should ignore leading non-layout elements if <body> tag is provided', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<!-- Comment 1 -->' +
+					'<style>#foo { color: red }</style>' +
+					'<script>bar</script>' +
+					'<body>' +
+						'<p>' +
+							'<!-- Comment 2 -->' +
+							'Paragraph' +
+						'</p>' +
+					'</body>'
+				);
 
-		it( 'should insert leading comment nodes from HTML string into <body> collection #2', () => {
-			// The existence of the <meta> tag causes that DOMParser inserts this element into the <head>. Moreover, all subsequent comment
-			// nodes (up until the node, that is not valid inside the <head>, which is the <h2> in our case) are also inserted into the
-			// <head>. So both <!-- Comment 3 --> and <!-- Comment 4 --> nodes, that are located between the <meta> and <h2> in the HTML
-			// string, are insterted into the <head>.
-			const bodyDocumentFragment = dataProcessor._toDom(
-				'<!-- Comment 1 -->' +
-				'<!-- Comment 2 -->' +
-				'<meta>' + // inserted into the <head> by DOMParser#parseFromString()
-				'<!-- Comment 3 -->' + // inserted into the <head> by DOMParser#parseFromString()
-				'<!-- Comment 4 -->' + // inserted into the <head> by DOMParser#parseFromString()
-				'<h2>Heading</h2>' +
-				'<p>Paragraph</p>' +
-				'<!-- Comment 5 -->' +
-				'<!-- Comment 6 -->'
-			);
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 1 );
 
-			const [
-				comment1,
-				comment2,
-				heading,
-				paragraph,
-				comment5,
-				comment6
-			] = bodyDocumentFragment.childNodes;
+				const [ paragraph ] = bodyDocumentFragment.childNodes;
+				const [ comment2, text ] = paragraph.childNodes;
 
-			expect( bodyDocumentFragment.childNodes.length ).to.equal( 6 );
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
 
-			expect( comment1.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment1.data ).to.equal( ' Comment 1 ' );
+				expect( text.nodeType ).to.equal( Node.TEXT_NODE );
+				expect( text.data ).to.equal( 'Paragraph' );
+			} );
 
-			expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment2.data ).to.equal( ' Comment 2 ' );
+			it( 'should ignore leading non-layout elements if <meya> tag is provided', () => {
+				const bodyDocumentFragment = dataProcessor._toDom(
+					'<meta>' +
+					'<!-- Comment 1 -->' +
+					'<style>#foo { color: red }</style>' +
+					'<script>bar</script>' +
+					'<p>' +
+						'<!-- Comment 2 -->' +
+						'Paragraph' +
+					'</p>'
+				);
 
-			expect( comment5.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment5.data ).to.equal( ' Comment 5 ' );
+				expect( bodyDocumentFragment.childNodes.length ).to.equal( 1 );
 
-			expect( comment6.nodeType ).to.equal( Node.COMMENT_NODE );
-			expect( comment6.data ).to.equal( ' Comment 6 ' );
+				const [ paragraph ] = bodyDocumentFragment.childNodes;
+				const [ comment2, text ] = paragraph.childNodes;
 
-			expect( heading.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( heading.outerHTML ).to.equal( '<h2>Heading</h2>' );
+				expect( comment2.nodeType ).to.equal( Node.COMMENT_NODE );
+				expect( comment2.data ).to.equal( ' Comment 2 ' );
 
-			expect( paragraph.nodeType ).to.equal( Node.ELEMENT_NODE );
-			expect( paragraph.outerHTML ).to.equal( '<p>Paragraph</p>' );
+				expect( text.nodeType ).to.equal( Node.TEXT_NODE );
+				expect( text.data ).to.equal( 'Paragraph' );
+			} );
 		} );
 	} );
 

From f17a94152009bcfc5fd8e1aedf691267c32b79f7 Mon Sep 17 00:00:00 2001
From: Aleksander Nowodzinski <a.nowodzinski@cksource.com>
Date: Mon, 17 Jan 2022 11:16:24 +0100
Subject: [PATCH 2/2] Applied review suggestions.

---
 .../ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js b/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
index e11b1e6ab69..28565ef3626 100644
--- a/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
+++ b/packages/ckeditor5-engine/tests/dataprocessor/htmldataprocessor.js
@@ -334,7 +334,7 @@ describe( 'HtmlDataProcessor', () => {
 				expect( text.data ).to.equal( 'Paragraph' );
 			} );
 
-			it( 'should ignore leading non-layout elements if <meya> tag is provided', () => {
+			it( 'should ignore leading non-layout elements if <meta> tag is provided', () => {
 				const bodyDocumentFragment = dataProcessor._toDom(
 					'<meta>' +
 					'<!-- Comment 1 -->' +