Skip to content

Commit

Permalink
Support UTF-32 Encoding
Browse files Browse the repository at this point in the history
GitHub: fix #212

Co-authored-by: Sutou Kouhei <[email protected]>
  • Loading branch information
naitoh and kou committed Oct 9, 2024
1 parent 206666c commit 21d0c1d
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 11 deletions.
2 changes: 2 additions & 0 deletions lib/rexml/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,8 @@ def version
# Returns the XMLDecl encoding of the document,
# if it has been set, otherwise the default encoding:
#
# d = REXML::Document.new('<?xml version="1.0" encoding="UTF-32"?>')
# d.encoding # => "UTF-32"
# d = REXML::Document.new('<?xml version="1.0" encoding="UTF-16"?>')
# d.encoding # => "UTF-16"
# d = REXML::Document.new('')
Expand Down
5 changes: 4 additions & 1 deletion lib/rexml/output.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ def initialize real_IO, encd="iso-8859-1"

@to_utf = encoding != 'UTF-8'

if encoding == "UTF-16"
if encoding == "UTF-32"
@output << "\ufeff".encode("UTF-32BE")
self.encoding = "UTF-32BE"
elsif encoding == "UTF-16"
@output << "\ufeff".encode("UTF-16BE")
self.encoding = "UTF-16BE"
end
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ def record_entity_expansion(delta=1)

# Tells the caller whether the encoding named in the XML declaration
# should be applied to the source.
#
# xml_declaration_encoding - the declared encoding name, or nil when the
#                            declaration carries no encoding.
#
# Returns false when no encoding was declared, and false when the
# declaration names exactly "UTF-16" or "UTF-32" (case-insensitive).
# Returns true for every other name, including endian-specific ones
# such as "UTF-16BE".
def need_source_encoding_update?(xml_declaration_encoding)
  return false if xml_declaration_encoding.nil?

  # A single anchored pattern covers both generic UTF names, so no
  # separate UTF-16-only check is needed.
  # NOTE(review): presumably these are skipped because the concrete
  # byte order was already settled by BOM detection - confirm.
  return false if /\AUTF-(?:32|16)\z/i.match?(xml_declaration_encoding)

  true
end

Expand Down Expand Up @@ -748,8 +748,12 @@ def process_instruction
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
if encoding.nil?
if /\AUTF-32(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-32"
elsif encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
end
standalone = STANDALONE.match(content)
standalone = standalone[1] unless standalone.nil?
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,11 @@ def detect_encoding
detected_encoding = "UTF-8"
begin
@scanner.string.force_encoding("ASCII-8BIT")
if @scanner.scan(/\xfe\xff/n)
if @scanner.scan(/\x00\x00\xfe\xff/n)
detected_encoding = "UTF-32BE"
elsif @scanner.scan(/\xff\xfe\x00\x00/n)
detected_encoding = "UTF-32LE"
elsif @scanner.scan(/\xfe\xff/n)
detected_encoding = "UTF-16BE"
elsif @scanner.scan(/\xff\xfe/n)
detected_encoding = "UTF-16LE"
Expand Down Expand Up @@ -192,7 +196,7 @@ def initialize(arg, block_size=500, encoding=nil)
if encoding
super("", encoding)
else
super(@source.read(3) || "")
super(@source.read(4) || "")
end

if !@to_utf and
Expand Down Expand Up @@ -321,7 +325,7 @@ def readline(term = nil)

def encoding_updated
case @encoding
when "UTF-16BE", "UTF-16LE"
when "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE"
@source.binmode
@source.set_encoding(@encoding, @encoding)
end
Expand Down
Binary file added test/data/utf32.xml
Binary file not shown.
4 changes: 2 additions & 2 deletions test/parse/test_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def test_before_root
assert_equal(<<~DETAIL.chomp, exception.to_s)
Malformed XML: Content at the start of the document (got 'b')
Line: 1
Position: 4
Position: 8
Last 80 unconsumed characters:
<a>
<a></a>
DETAIL
end

Expand Down
5 changes: 5 additions & 0 deletions test/test_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,11 @@ def test_xmldecl_utf_16be_encoding_name
XMLDecl.new("1.0", "UTF-16").to_s)
end

# An XMLDecl built with the encoding "UTF-32" must serialize with that
# same encoding name in its declaration string.
def test_xmldecl_utf_32be_encoding_name
  declaration = XMLDecl.new("1.0", "UTF-32")
  expected = "<?xml version='1.0' encoding='UTF-32'?>"
  assert_equal(expected, declaration.to_s)
end

def each_test( element, xpath, num_children )
count = 0
element.each_element( xpath ) { |child|
Expand Down
81 changes: 80 additions & 1 deletion test/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,26 @@ def test_utf_16be
document = REXML::Document.new(bom + xml)
assert_equal("UTF-16", document.encoding)
end

# Parses a little-endian UTF-32 byte string that is prefixed with a BOM and
# whose XML declaration says encoding="UTF-32". The document is expected to
# report the generic name "UTF-32".
def test_utf_32le
# The text is encoded first, then re-tagged as binary so it is passed in as raw bytes.
xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
<?xml version="1.0" encoding="UTF-32"?>
<message>Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
end

# Parses a big-endian UTF-32 byte string that is prefixed with a BOM and
# whose XML declaration says encoding="UTF-32". The document is expected to
# report the generic name "UTF-32".
def test_utf_32be
# The text is encoded first, then re-tagged as binary so it is passed in as raw bytes.
xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
<?xml version="1.0" encoding="UTF-32"?>
<message>Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
end
end

class NoEncodingTest < self
Expand Down Expand Up @@ -383,6 +403,26 @@ def test_utf_16be
document = REXML::Document.new(bom + xml)
assert_equal("UTF-16", document.encoding)
end

# Parses a BOM-prefixed little-endian UTF-32 byte string whose XML
# declaration has no encoding attribute. The document is still expected to
# report "UTF-32".
def test_utf_32le
# No encoding="..." here: the byte stream itself is the only encoding hint.
xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
<?xml version="1.0"?>
<message>Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
end

# Parses a BOM-prefixed big-endian UTF-32 byte string whose XML
# declaration has no encoding attribute. The document is still expected to
# report "UTF-32".
def test_utf_32be
# No encoding="..." here: the byte stream itself is the only encoding hint.
xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
<?xml version="1.0"?>
<message>Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
end
end

class WriteTest < self
Expand All @@ -399,13 +439,30 @@ def test_utf_16
expected_xml = <<-EOX.chomp.encode("UTF-16BE")
\ufeff<?xml version='1.0' encoding='UTF-16'?>
<message>Hello world!</message>
EOX
assert_equal(expected_xml, actual_xml)
end

# Round-trips a UTF-32 document through Document#write.
#
# The input is a BOM-prefixed little-endian UTF-32 byte string with no
# declared encoding. The written output is expected to be big-endian
# UTF-32, to start with a BOM, and to declare encoding='UTF-32'.
def test_utf_32
xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
<?xml version="1.0"?>
<message>Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)

# The document is written into a plain String acting as the output target.
actual_xml = ""
document.write(actual_xml)
# chomp drops the heredoc's trailing newline before the expected text is encoded.
expected_xml = <<-EOX.chomp.encode("UTF-32BE")
\ufeff<?xml version='1.0' encoding='UTF-32'?>
<message>Hello world!</message>
EOX
assert_equal(expected_xml, actual_xml)
end
end

class ReadUntilTest < Test::Unit::TestCase
def test_utf_8
def test_utf_8
xml = <<-EOX.force_encoding("ASCII-8BIT")
<?xml version="1.0" encoding="UTF-8"?>
<message testing=">">Hello world!</message>
Expand Down Expand Up @@ -436,6 +493,28 @@ def test_utf_16be
assert_equal("UTF-16", document.encoding)
assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
end

# Parses a BOM-prefixed little-endian UTF-32 document whose root element
# has an attribute value consisting of ">". Checks both the reported
# encoding and that the attribute value is read back as ">".
def test_utf_32le
xml = <<-EOX.encode("UTF-32LE").force_encoding("ASCII-8BIT")
<?xml version="1.0" encoding="UTF-32"?>
<message testing=">">Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32LE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
# Look the root element up via XPath and compare its "testing" attribute.
assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
end

# Parses a BOM-prefixed big-endian UTF-32 document whose root element
# has an attribute value consisting of ">". Checks both the reported
# encoding and that the attribute value is read back as ">".
def test_utf_32be
xml = <<-EOX.encode("UTF-32BE").force_encoding("ASCII-8BIT")
<?xml version="1.0" encoding="UTF-32"?>
<message testing=">">Hello world!</message>
EOX
bom = "\ufeff".encode("UTF-32BE").force_encoding("ASCII-8BIT")
document = REXML::Document.new(bom + xml)
assert_equal("UTF-32", document.encoding)
# Look the root element up via XPath and compare its "testing" attribute.
assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
end
end
end
end
Expand Down
19 changes: 18 additions & 1 deletion test/test_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,15 @@ def test_parse_utf16
REXML::Document.new(f)
end
assert_equal("UTF-16", utf16.encoding)
assert( utf16[0].kind_of?(REXML::XMLDecl))
assert(utf16[0].kind_of?(REXML::XMLDecl))
end

# Parses the utf32.xml fixture from an open File and checks that the
# document reports "UTF-32" and that its first child is an XML declaration.
def test_parse_utf32
  document = File.open(fixture_path("utf32.xml")) { |file| REXML::Document.new(file) }
  assert_equal("UTF-32", document.encoding)
  first_child = document[0]
  assert(first_child.kind_of?(REXML::XMLDecl))
end

def test_parse_utf16_with_utf8_default_internal
Expand All @@ -103,5 +111,14 @@ def test_parse_utf16_with_utf8_default_internal
assert_equal("UTF-16", utf16.encoding)
end
end

# Parses the utf32.xml fixture inside a with_default_internal("UTF-8")
# block and checks that the document still reports "UTF-32".
# NOTE(review): with_default_internal is defined elsewhere; presumably it
# switches Encoding.default_internal for the duration of the block - confirm.
def test_parse_utf32_with_utf8_default_internal
  with_default_internal("UTF-8") do
    document = File.open(fixture_path("utf32.xml")) { |file| REXML::Document.new(file) }
    assert_equal("UTF-32", document.encoding)
  end
end
end
end

0 comments on commit 21d0c1d

Please sign in to comment.