From c28ef0ebbb152d1d02d5cac57fbc5d0b346118c2 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 1 Aug 2024 21:33:01 +0800 Subject: [PATCH 1/2] wikipedia-kyoto-japanese-english: increase REXML entity expansion limit during XML parsing Using `Datasets::WikipediaKyotoJapaneseEnglish#each` raised an `entity expansion has grown too large (RuntimeError)` error. This error occurs because the entity expansion limit in REXML is set by https://github.com/ruby/rexml/pull/187, and `Datasets::WikipediaKyotoJapaneseEnglish#each` exceeds that limit. In Red Datasets, increasing the entity expansion limit is not a problem because we want to handle large datasets. Therefore, we temporarily increase the limit. How to reproduce: ```console $ cd red-datasets && bundle $ bundle exec ruby example/wikipedia-kyoto-japanese-english.rb ... /home/otegami/.rbenv/versions/3.3.3/lib/ruby/gems/3.3.0/gems/rexml-3.3.4/lib/rexml/parsers/baseparser.rb:560:in `block in unnormalize': entity expansion has grown too large (RuntimeError) ... 
``` --- .../wikipedia-kyoto-japanese-english.rb | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/datasets/wikipedia-kyoto-japanese-english.rb b/lib/datasets/wikipedia-kyoto-japanese-english.rb index 227c6d0c..a1b0486c 100644 --- a/lib/datasets/wikipedia-kyoto-japanese-english.rb +++ b/lib/datasets/wikipedia-kyoto-japanese-english.rb @@ -88,8 +88,10 @@ def each(&block) when :article next unless base_name.end_with?(".xml") listener = ArticleListener.new(block) - parser = REXML::Parsers::StreamParser.new(entry.read, listener) - parser.parse + with_increased_entity_expansion_text_limit do + parser = REXML::Parsers::StreamParser.new(entry.read, listener) + parser.parse + end when :lexicon next unless base_name == "kyoto_lexicon.csv" is_header = true @@ -106,6 +108,9 @@ def each(&block) end private + + ENTITY_EXPANSION_TEXT_LIMIT = 163_840 + def download_tar_gz base_name = "wiki_corpus_2.01.tar.gz" data_path = cache_dir_path + base_name @@ -114,6 +119,14 @@ def download_tar_gz data_path end + def with_increased_entity_expansion_text_limit + default_limit = REXML::Security.entity_expansion_text_limit + REXML::Security.entity_expansion_text_limit = ENTITY_EXPANSION_TEXT_LIMIT + yield + ensure + REXML::Security.entity_expansion_text_limit = default_limit + end + class ArticleListener include REXML::StreamListener From e3726fc0a5971d9bd3c9087d5a7062c280b3e8d8 Mon Sep 17 00:00:00 2001 From: otegami Date: Sun, 4 Aug 2024 21:05:15 +0800 Subject: [PATCH 2/2] Increase entity expansion limit only during necessary parsing operations --- lib/datasets/wikipedia-kyoto-japanese-english.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/datasets/wikipedia-kyoto-japanese-english.rb b/lib/datasets/wikipedia-kyoto-japanese-english.rb index a1b0486c..0b1f2c16 100644 --- a/lib/datasets/wikipedia-kyoto-japanese-english.rb +++ b/lib/datasets/wikipedia-kyoto-japanese-english.rb @@ -88,8 +88,8 @@ def each(&block) when :article next 
unless base_name.end_with?(".xml") listener = ArticleListener.new(block) + parser = REXML::Parsers::StreamParser.new(entry.read, listener) with_increased_entity_expansion_text_limit do - parser = REXML::Parsers::StreamParser.new(entry.read, listener) parser.parse end when :lexicon