From 06d64c67d05007365c453302ac4280c78051cd5e Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 28 Feb 2023 15:55:32 -0500 Subject: [PATCH] fix(jruby): serializing HTML with no save options emits HTML Previously this emitted XML. This brings the implementation into agreement with the CRuby implementation. --- CHANGELOG.md | 3 +++ ext/java/nokogiri/XmlNode.java | 13 ++++++++-- .../internals/SaveContextVisitor.java | 1 + test/html4/test_document.rb | 25 +++++++++++++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ef0165c628..2d2257b741e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Fixed +* [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation. + + ### Improved * `Nokogiri::XML::Node::SaveOptions#inspect` now shows the names of the options set in the bitmask, similar to `ParseOptions`. 
[[#2767](https://github.com/sparklemotion/nokogiri/issues/2767)] diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index 69e67729597..c729f0c441a 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -1342,12 +1342,21 @@ public class XmlNode extends RubyObject IRubyObject io = args[0]; IRubyObject encoding = args[1]; IRubyObject indentString = args[2]; - IRubyObject options = args[3]; + IRubyObject options_rb = args[3]; + int options = RubyFixnum.fix2int(options_rb); String encString = rubyStringToString(encoding); + // similar to behavior of libxml2's xmlSaveTree function + if ((options & SaveContextVisitor.AS_XML) == 0 && + (options & SaveContextVisitor.AS_XHTML) == 0 && + (options & SaveContextVisitor.AS_HTML) == 0 && + isHtmlDoc(context)) { + options |= SaveContextVisitor.DEFAULT_HTML; + } + SaveContextVisitor visitor = - new SaveContextVisitor(RubyFixnum.fix2int(options), rubyStringToString(indentString), encString, isHtmlDoc(context), + new SaveContextVisitor(options, rubyStringToString(indentString), encString, isHtmlDoc(context), isFragment(), 0); accept(context, visitor); diff --git a/ext/java/nokogiri/internals/SaveContextVisitor.java b/ext/java/nokogiri/internals/SaveContextVisitor.java index 40708dbbbae..a1ccc348d9b 100644 --- a/ext/java/nokogiri/internals/SaveContextVisitor.java +++ b/ext/java/nokogiri/internals/SaveContextVisitor.java @@ -74,6 +74,7 @@ public class SaveContextVisitor public static final int AS_XML = 32; public static final int AS_HTML = 64; public static final int AS_BUILDER = 128; + public static final int DEFAULT_HTML = NO_DECL | NO_EMPTY | AS_HTML; public static final int CANONICAL = 1; public static final int INCL_NS = 2; diff --git a/test/html4/test_document.rb b/test/html4/test_document.rb index df455983434..af1788477ef 100644 --- a/test/html4/test_document.rb +++ b/test/html4/test_document.rb @@ -755,6 +755,31 @@ def test_leaking_dtd_nodes_after_internal_subset_removal 
assert_equal(expected, doc.at_css("body").children.map(&:type)) end + it "emits HTML even when no save options are specified" do + doc = Nokogiri::HTML4::Document.parse("
hello
") + expected = doc.to_html + + assert_equal( + expected, + doc.write_to(StringIO.new, save_with: Nokogiri::XML::Node::SaveOptions::DEFAULT_HTML).tap(&:rewind).read, + ) + assert_equal( + expected, + doc.write_to(StringIO.new).tap(&:rewind).read, + ) + + # but not when the AS_XML or AS_XHTML flag is set + as_xml = doc.write_to(StringIO.new, save_with: Nokogiri::XML::Node::SaveOptions::AS_XML).tap(&:rewind).read + pp as_xml + refute_equal(expected, as_xml) + assert(as_xml.start_with?("