diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2f068e0c..275372ee 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -132,6 +132,13 @@ module Private GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private @@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil ) unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) else er = DEFAULT_ENTITIES[entity_reference] @@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil ) end end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..b6a48c93 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,26 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + case element_name + when 'a' + assert_equal('A', event[1]) + when 'b' + assert_equal('B', event[1]) + end + end + end + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source)