diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d315c2ae17..ba9a39dbfbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,11 +10,15 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Added +* Serialization methods like `#to_xml`, `#to_html`, `#serialize`, and `#write_to` now accept `Encoding` objects specifying the output encoding. Previously only encoding names (strings) were accepted. [[#2774](https://github.com/sparklemotion/nokogiri/issues/2774), [#2798](https://github.com/sparklemotion/nokogiri/issues/2798)] (Thanks, [@ellaklara](https://github.com/ellaklara)!) + + ### Changed ### Fixed * [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation. +* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation. ### Improved diff --git a/lib/nokogiri/html5/node.rb b/lib/nokogiri/html5/node.rb index 5edbb366574..b2773868b57 100644 --- a/lib/nokogiri/html5/node.rb +++ b/lib/nokogiri/html5/node.rb @@ -17,6 +17,9 @@ # limitations under the License. # +# +# TODO: this whole file should go away. maybe make it a decorator? +# require_relative "../xml/node" module Nokogiri @@ -50,6 +53,8 @@ def write_to(io, *options) config = XML::Node::SaveOptions.new(save_options.to_i) yield config if block_given? + encoding = encoding.is_a?(Encoding) ? encoding.name : encoding + config_options = config.options if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0 # Use Nokogiri's serializing code. diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index e2ca9372976..d106651e6de 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1269,11 +1269,11 @@ def <=>(other) # # These two statements are equivalent: # - # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML) + # node.serialize(encoding: 'UTF-8', save_with: FORMAT | AS_XML) # # or # - # node.serialize(:encoding => 'UTF-8') do |config| + # node.serialize(encoding: 'UTF-8') do |config| # config.format.as_xml # end # @@ -1310,7 +1310,7 @@ def to_html(options = {}) ### # Serialize this Node to XML using +options+ # - # doc.to_xml(:indent => 5, :encoding => 'UTF-8') + # doc.to_xml(indent: 5, encoding: 'UTF-8') # # See Node#write_to for a list of +options+ def to_xml(options = {}) @@ -1321,7 +1321,7 @@ def to_xml(options = {}) ### # Serialize this Node to XHTML using +options+ # - # doc.to_xhtml(:indent => 5, :encoding => 'UTF-8') + # doc.to_xhtml(indent: 5, encoding: 'UTF-8') # # See Node#write_to for a list of +options+ def to_xhtml(options = {}) @@ -1329,25 +1329,32 @@ def to_xhtml(options = {}) end ### - # Write Node to +io+ with +options+. +options+ modify the output of - # this method. Valid options are: + # :call-seq: + # write_to(io, *options) + # + # Serialize this node or document to +io+. # - # * +:encoding+ for changing the encoding - # * +:indent_text+ the indentation text, defaults to one space - # * +:indent+ the number of +:indent_text+ to use, defaults to 2 - # * +:save_with+ a combination of SaveOptions constants. + # [Parameters] + # - +io+ (IO) An IO-like object to which the serialized content will be written. + # - +options+ (Hash) See below + # + # [Options] + # * +:encoding+ (String or Encoding) specify the encoding of the output (defaults to document encoding) + # * +:indent_text+ (String) the indentation text (defaults to " ") + # * +:indent+ (Integer) the number of +:indent_text+ to use (defaults to +2+) + # * +:save_with+ (Integer) a combination of SaveOptions constants # # To save with UTF-8 indented twice: # - # node.write_to(io, :encoding => 'UTF-8', :indent => 2) + # node.write_to(io, encoding: 'UTF-8', indent: 2) # # To save indented with two dashes: # - # node.write_to(io, :indent_text => '-', :indent => 2) + # node.write_to(io, indent_text: '-', indent: 2) # def write_to(io, *options) options = options.first.is_a?(Hash) ? options.shift : {} - encoding = options[:encoding] || options[0] + encoding = options[:encoding] || options[0] || document.encoding if Nokogiri.jruby? save_options = options[:save_with] || options[1] indent_times = options[:indent] || 0 @@ -1365,6 +1372,8 @@ def write_to(io, *options) config = SaveOptions.new(save_options.to_i) yield config if block_given? + encoding = encoding.is_a?(Encoding) ? encoding.name : encoding + native_write_to(io, encoding, indentation, config.options) end diff --git a/test/test_serialization_encoding.rb b/test/test_serialization_encoding.rb new file mode 100644 index 00000000000..29ed10e0139 --- /dev/null +++ b/test/test_serialization_encoding.rb @@ -0,0 +1,114 @@ +# coding: utf-8 +# frozen_string_literal: true + +require "helper" + +class TestSerializationEncoding < Nokogiri::TestCase + def round_trip_through_file + Tempfile.create do |io| + yield io + io.rewind + io.read + end + end + + describe "serialization encoding" do + matrix = [ + { + klass: Nokogiri::XML::Document, + documents: [ + { encoding: Encoding::UTF_8, path: ADDRESS_XML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_XML }, + ], + }, + { + klass: Nokogiri::HTML4::Document, + documents: [ + { encoding: Encoding::UTF_8, path: HTML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_HTML }, + ], + }, + ] + if Nokogiri.uses_gumbo? + matrix << { + klass: Nokogiri::HTML5::Document, + documents: [ + { encoding: Encoding::UTF_8, path: HTML_FILE }, + { encoding: Encoding::Shift_JIS, path: SHIFT_JIS_HTML }, + ], + } + end + + matrix.each do |matrix_entry| + describe matrix_entry[:klass] do + let(:klass) { matrix_entry[:klass] } + matrix_entry[:documents].each do |document| + describe document[:encoding] do + it "serializes with the expected encoding" do + doc = klass.parse( + File.read( + document[:path], + encoding: document[:encoding], + ), + ) + + expected_default_encoding = + if defined?(Nokogiri::HTML5::Document) && klass == Nokogiri::HTML5::Document + Encoding::UTF_8 # FIXME: see #2801, this should be document[:encoding] + else + document[:encoding] + end + + assert_equal(expected_default_encoding, doc.to_s.encoding) + + assert_equal(expected_default_encoding, doc.to_xml.encoding) + assert_equal(Encoding::UTF_8, doc.to_xml(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xml(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_xml(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xml(encoding: Encoding::Shift_JIS).encoding) + + assert_equal(expected_default_encoding, doc.to_xhtml.encoding) + assert_equal(Encoding::UTF_8, doc.to_xhtml(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xhtml(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_xhtml(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_xhtml(encoding: Encoding::Shift_JIS).encoding) + + assert_equal(expected_default_encoding, doc.to_html.encoding) + assert_equal(Encoding::UTF_8, doc.to_html(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.to_html(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.to_html(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.to_html(encoding: Encoding::Shift_JIS).encoding) + + assert_equal(expected_default_encoding, doc.serialize.encoding) + assert_equal(Encoding::UTF_8, doc.serialize(encoding: "UTF-8").encoding) + assert_equal(Encoding::Shift_JIS, doc.serialize(encoding: "SHIFT_JIS").encoding) + assert_equal(Encoding::UTF_8, doc.serialize(encoding: Encoding::UTF_8).encoding) + assert_equal(Encoding::Shift_JIS, doc.serialize(encoding: Encoding::Shift_JIS).encoding) + + assert_equal( + doc.serialize.bytes, + round_trip_through_file { |io| doc.write_to(io) }.bytes, + ) + assert_equal( + doc.serialize(encoding: "UTF-8").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: "UTF-8") }.bytes, + ) + assert_equal( + doc.serialize(encoding: "SHIFT_JIS").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: "SHIFT_JIS") }.bytes, + ) + assert_equal( + doc.serialize(encoding: "UTF-8").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: Encoding::UTF_8) }.bytes, + ) + assert_equal( + doc.serialize(encoding: "Shift_JIS").bytes, + round_trip_through_file { |io| doc.write_to(io, encoding: Encoding::Shift_JIS) }.bytes, + ) + end + end + end + end + end + end +end