From 67efb5951ed09dbb575c375b130a1e469f437d1f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:26:57 +0900 Subject: [PATCH] Fix performance issue caused by using repeated `>` characters inside `]>` (#175) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 8 ++++++-- test/parse/test_entity_declaration.rb | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 4fcdaba7..e8f1a069 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,11 +124,15 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + # Terminal requires two or more letters. INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" CDATA_TERM = "]]>" DOCTYPE_TERM = "]>" + # Read to the end of DOCTYPE because there is no proper ENTITY termination + ENTITY_TERM = DOCTYPE_TERM + + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -313,7 +317,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " ]> DETAIL end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('' * n + '">') + end + end end end