Skip to content

Commit

Permalink
Fix a bug that Stream parser doesn't expand the user-defined entity r…
Browse files Browse the repository at this point in the history
…eferences for "text"

## Why?
The Stream parser expands character references and predefined entity references in text, but doesn't expand user-defined entity references.

## Change
- text_stream_unnormalize.rb
```
# Demo script: prints the text content that each REXML front end (DOM, Pull,
# Stream, SAX) reports for a document that uses user-defined entities,
# predefined entities and a character reference.
#
# "lib" is resolved against the current working directory, so run this from
# the directory that contains the library's lib/ folder.
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'
require 'rexml/streamlistener'

# Sample document: "lala" and "lalal" are entities whose values themselves
# reference the "la" entity.
xml = <<EOS
<!DOCTYPE foo [
  <!ENTITY la "1234">
  <!ENTITY lala "--&la;--">
  <!ENTITY lalal "&la;&la;">
]><root><la>&la;</la><lala>&lala;</lala><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a><b>test&#8482;</b></root>
EOS

# Stream listener that echoes every text event to stdout.
class StListener
  include REXML::StreamListener

  def text(text)
    puts text
  end
end

# DOM: print the text of each child element of <root>.
puts "REXML(DOM)"
REXML::Document.new(xml).elements.each("/root/*") do |child|
  puts child.text
end

# Pull: drain the parser and print the payload of each :text event.
puts
puts "REXML(Pull)"
pull_parser = REXML::Parsers::PullParser.new(xml)
while pull_parser.has_next?
  pulled = pull_parser.pull
  puts pulled[1] if pulled.event_type == :text
end

# Stream: the listener prints each text event as it is delivered.
puts
puts "REXML(Stream)"
REXML::Parsers::StreamParser.new(xml, StListener.new).parse

# SAX: print whatever is passed to the :characters callback.
puts
puts "REXML(SAX)"
sax_parser = REXML::Parsers::SAX2Parser.new(xml)
sax_parser.listen(:characters) { |chars| puts chars }
sax_parser.parse
```

## Before (master)
```
$ ruby  text_stream_unnormalize.rb
REXML(DOM)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Pull)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Stream)
&la;           #<= This
&lala;         #<= This
<P> <I> <B> Text </B> </I>
test™

REXML(SAX)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™
```

## After (this PR)

```
$ ruby  text_stream_unnormalize.rb
REXML(DOM)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Pull)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Stream)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(SAX)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™
```
  • Loading branch information
naitoh committed Aug 20, 2024
1 parent 68939ea commit 8b97bae
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
4 changes: 3 additions & 1 deletion lib/rexml/parsers/streamparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class StreamParser
# Creates a stream parser that reads +source+ through a BaseParser and
# reports parse events to +listener+.
def initialize(source, listener)
  # Entity table; starts empty and is passed to the base parser's
  # unnormalize call when text events are handled.
  @entities = {}
  @parser = BaseParser.new(source)
  @listener = listener
end

def add_listener( listener )
Expand All @@ -28,7 +29,7 @@ def parse
when :end_element
@listener.tag_end( event[1] )
when :text
unnormalized = @parser.unnormalize( event[1] )
unnormalized = @parser.unnormalize( event[1], @entities )
@listener.text( unnormalized )
when :processing_instruction
@listener.instruction( *event[1,2] )
Expand All @@ -40,6 +41,7 @@ def parse
when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
@listener.send( event[0].to_s, *event[1..-1] )
when :entitydecl, :notationdecl
@entities[ event[1] ] = event[2] if event.size == 3
@listener.send( event[0].to_s, event[1..-1] )
when :externalentity
entity_reference = event[1]
Expand Down
19 changes: 19 additions & 0 deletions test/test_stream.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,25 @@ def entity(content)
assert_equal(["ISOLat2"], listener.entities)
end

# Text events delivered to a stream listener must carry user-defined entity
# references already expanded, including an entity ("lala") whose value
# references another entity ("la").
def test_entity_replacement
  xml = '<!DOCTYPE foo [
<!ENTITY la "1234">
<!ENTITY lala "--&la;--">
<!ENTITY lalal "&la;&la;">
]><a><la>&la;</la><lala>&lala;</lala></a>'

  collected = []
  recorder = MyListener.new
  # Override the text callback on this one instance only, so every text
  # event is captured in delivery order.
  recorder.define_singleton_method(:text) { |text| collected << text }

  REXML::Document.parse_stream(xml, recorder)

  assert_equal(["1234", "--1234--"], collected)
end

def test_characters_predefined_entities
source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a></root>'

Expand Down

0 comments on commit 8b97bae

Please sign in to comment.