From 7f9bb4bbaab83eed2e04619407a024736a798db2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 17 Jul 2021 15:04:31 -0400 Subject: [PATCH] feat(cruby): patch libxml2 to handle abruptly-closed HTML comments See guidance provided on abruptly-closed comments here: https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment --- CHANGELOG.md | 1 + ...ment-handle-abruptly-closed-comments.patch | 61 +++++++++++++++++++ test/html4/test_comments.rb | 40 ++++++++---- 3 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index fdd157d8625..b3b0f97d6e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Improved +* [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!) * [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] diff --git a/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch b/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch new file mode 100644 index 00000000000..72e96c008ee --- /dev/null +++ b/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch @@ -0,0 +1,61 @@ +From 3ea8d08da310b645e37940eaae5cc28e251b155b Mon Sep 17 00:00:00 2001 +From: Mike Dalessio +Date: Sat, 17 Jul 2021 14:36:53 -0400 +Subject: [PATCH] htmlParseComment: handle abruptly-closed comments + +See guidance provided on abrutply-closed comments here: + +https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment +--- + HTMLparser.c | 11 +++++++++++ + include/libxml/xmlerror.h | 1 + + 2 files changed, 12 insertions(+) + +diff --git a/HTMLparser.c b/HTMLparser.c +index b56363a..f0bf294 100644 +--- a/HTMLparser.c ++++ b/HTMLparser.c +@@ -3485,10 +3485,20 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { + q = CUR_CHAR(ql); + if (q == 0) + goto unfinished; ++ if (q == '>') { ++ htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL); ++ cur = '>'; ++ goto finished; ++ } + NEXTL(ql); + r = CUR_CHAR(rl); + if (r == 0) + goto unfinished; ++ if (q == '-' && r == '>') { ++ htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL); ++ cur = '>'; ++ goto finished; ++ } + NEXTL(rl); + cur = CUR_CHAR(l); + while ((cur != 0) && +@@ -3536,6 +3546,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { + cur = next; + l = nl; + } ++finished: + buf[len] = 0; + if (cur == '>') { + NEXT; +diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h +index c101997..7b68e40 100644 +--- a/include/libxml/xmlerror.h ++++ b/include/libxml/xmlerror.h +@@ -209,6 +209,7 @@ typedef enum { + XML_ERR_VERSION_MISMATCH, /* 109 */ + XML_ERR_NAME_TOO_LONG, /* 110 */ + XML_ERR_USER_STOP, /* 111 */ ++ XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */ + XML_NS_ERR_XML_NAMESPACE = 200, + XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */ + XML_NS_ERR_QNAME, /* 202 */ +-- +2.31.0 + diff --git a/test/html4/test_comments.rb b/test/html4/test_comments.rb index 7d207815f54..95aa9027710 100644 --- a/test/html4/test_comments.rb +++ b/test/html4/test_comments.rb @@ -21,11 +21,20 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - it "behaves as if the comment is unterminated and doesn't exist" do # NON-COMPLIANT - assert_equal 0, subject.children.length - assert_equal 1, doc.errors.length - assert_match(/Comment not terminated/, doc.errors.first.to_s) - assert !other_div + if Nokogiri::VersionInfo.instance.libxml2_using_packaged? && Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + it "behaves as if the comment is closed correctly" do # COMPLIANT + assert_equal 1, subject.children.length + assert subject.children.first.comment? + assert_equal "", subject.children.first.content + assert other_div + end + else + it "behaves as if the comment is unterminated and doesn't exist" do # NON-COMPLIANT + assert_equal 0, subject.children.length + assert_equal 1, doc.errors.length + assert_match(/Comment not terminated/, doc.errors.first.to_s) + assert !other_div + end end end @@ -43,11 +52,20 @@ class TestComment < Nokogiri::TestCase let(:html) { "
" } if Nokogiri.uses_libxml? - it "behaves as if the comment is unterminated and doesn't exist" do # NON-COMPLIANT - assert_equal 0, subject.children.length - assert_equal 1, doc.errors.length - assert_match(/Comment not terminated/, doc.errors.first.to_s) - assert !other_div + if Nokogiri::VersionInfo.instance.libxml2_using_packaged? && Nokogiri::VERSION_INFO["libxml"]["patches"]&.include?("0008-htmlParseComment-handle-abruptly-closed-comments.patch") + it "behaves as if the comment is closed correctly" do # COMPLIANT + assert_equal 1, subject.children.length + assert subject.children.first.comment? + assert_equal "", subject.children.first.content + assert other_div + end + else + it "behaves as if the comment is unterminated and doesn't exist" do # NON-COMPLIANT + assert_equal 0, subject.children.length + assert_equal 1, doc.errors.length + assert_match(/Comment not terminated/, doc.errors.first.to_s) + assert !other_div + end end end @@ -55,7 +73,7 @@ class TestComment < Nokogiri::TestCase it "behaves as if the comment is closed correctly" do # COMPLIANT assert_equal 1, subject.children.length assert subject.children.first.comment? - assert_equal "-", subject.children.first.content # curious + assert_equal "-", subject.children.first.content # curious, potentially non-compliant? assert other_div end end