From a38f629dc354fa76860f7124f44843dff3a08137 Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Thu, 17 Aug 2023 10:27:26 +0900 Subject: [PATCH] Support non-breaking content in Python (#251) This patch supports non-breaking content in Python. In Java and Python implementations, the "Skip" operation includes the skipped content in the text passed to the BudouX parser, so no changes to the text for the parser are needed. This patch changes the following items: 1. Changed `to_skip` to a stack of elements, rather than always resetting it to `False` at the end of an element. 2. When there's a phrase boundary right before the "skip" element, insert a break before the "skip" element. Note that `<nobr>` was added to `skip_nodes.json` at: https://github.com/google/budoux/pull/248. --- budoux/html_processor.py | 11 +++++++++-- tests/test_html_processor.py | 13 ++++++++++--- tests/test_parser.py | 7 +++++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/budoux/html_processor.py b/budoux/html_processor.py index 5fb859ca..dc8ba15a 100644 --- a/budoux/html_processor.py +++ b/budoux/html_processor.py @@ -15,6 +15,7 @@ import json import os +import queue import typing from html.parser import HTMLParser @@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]): self.chunks_joined = SEP.join(chunks) self.to_skip = False self.scan_index = 0 + self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue() def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: attr_pairs = [] @@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: else: attr_pairs.append(' %s="%s"' % (attr[0], attr[1])) encoded_attrs = ''.join(attr_pairs) + self.element_stack.put(self.to_skip) + if tag.upper() in SKIP_NODES: + if not self.to_skip and self.chunks_joined[self.scan_index] == SEP: + self.scan_index += 1 + self.output += '' + self.to_skip = True self.output += '<%s%s>' % (tag, encoded_attrs) - self.to_skip = tag.upper() in SKIP_NODES def handle_endtag(self, tag: str) -> None: self.output += '' % 
(tag) - self.to_skip = False + self.to_skip = self.element_stack.get(block=False) def handle_data(self, data: str) -> None: for char in data: diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py index 5d769ac6..8963bb94 100644 --- a/tests/test_html_processor.py +++ b/tests/test_html_processor.py @@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None: self.assertEqual(result, expected) def test_with_nodes_to_skip(self) -> None: - chunks = ['abc', 'def'] - html = "af" + chunks = ['abc', 'def', 'ghi'] + html = "afghi" + result = html_processor.resolve(chunks, html) + expected = 'afghi' + self.assertEqual(result, expected) + + def test_with_break_before_skip(self) -> None: + chunks = ['abc', 'def', 'ghi', 'jkl'] + html = "abcjkl" result = html_processor.resolve(chunks, html) - expected = 'af' + expected = 'abcjkl' self.assertEqual(result, expected) def test_with_nothing_to_split(self) -> None: diff --git a/tests/test_parser.py b/tests/test_parser.py index baa0d239..af13230c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -70,9 +70,12 @@ def test_translate_html_string(self) -> None: ) input_html = 'xyzxyzabc' + # TODO: Because the content for skip elements are included, this test tries + # to break before "alert". We may want to distinguish "skip from the + # content" and "skip breaking" in future. expected_html = ( '' - 'xyzxyzabc') + 'xyzxyzabc') output_html = p.translate_html_string(input_html) self.assertEqual(output_html, expected_html, 'Should pass script tags as is.') @@ -80,7 +83,7 @@ def test_translate_html_string(self) -> None: input_html = 'xyzabcabc' expected_html = ( '' - 'xyzabcabc') + 'xyzabcabc') output_html = p.translate_html_string(input_html) self.assertEqual(output_html, expected_html, 'Should skip some specific tags.')