diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index b47bad3b..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..8ddeccaa --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end