diff --git a/budoux/html_processor.py b/budoux/html_processor.py index 5fb859ca..dc8ba15a 100644 --- a/budoux/html_processor.py +++ b/budoux/html_processor.py @@ -15,6 +15,7 @@ import json import os +import queue import typing from html.parser import HTMLParser @@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]): self.chunks_joined = SEP.join(chunks) self.to_skip = False self.scan_index = 0 + self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue() def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: attr_pairs = [] @@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None: else: attr_pairs.append(' %s="%s"' % (attr[0], attr[1])) encoded_attrs = ''.join(attr_pairs) + self.element_stack.put(self.to_skip) + if tag.upper() in SKIP_NODES: + if not self.to_skip and self.chunks_joined[self.scan_index] == SEP: + self.scan_index += 1 + self.output += '' + self.to_skip = True self.output += '<%s%s>' % (tag, encoded_attrs) - self.to_skip = tag.upper() in SKIP_NODES def handle_endtag(self, tag: str) -> None: self.output += '' % (tag) - self.to_skip = False + self.to_skip = self.element_stack.get(block=False) def handle_data(self, data: str) -> None: for char in data: diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py index 5d769ac6..8963bb94 100644 --- a/tests/test_html_processor.py +++ b/tests/test_html_processor.py @@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None: self.assertEqual(result, expected) def test_with_nodes_to_skip(self) -> None: - chunks = ['abc', 'def'] - html = "af" + chunks = ['abc', 'def', 'ghi'] + html = "afghi" + result = html_processor.resolve(chunks, html) + expected = 'afghi' + self.assertEqual(result, expected) + + def test_with_break_before_skip(self) -> None: + chunks = ['abc', 'def', 'ghi', 'jkl'] + html = "abcjkl" result = html_processor.resolve(chunks, html) - expected = 'af' + expected = 'abcjkl' self.assertEqual(result, expected) def test_with_nothing_to_split(self) -> None: diff --git a/tests/test_parser.py b/tests/test_parser.py index baa0d239..af13230c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -70,9 +70,12 @@ def test_translate_html_string(self) -> None: ) input_html = 'xyzxyzabc' + # TODO: Because the content for skip elements are included, this test tries + # to break before "alert". We may want to distinguish "skip from the + # content" and "skip breaking" in future. expected_html = ( '' - 'xyzxyzabc') + 'xyzxyzabc') output_html = p.translate_html_string(input_html) self.assertEqual(output_html, expected_html, 'Should pass script tags as is.') @@ -80,7 +83,7 @@ def test_translate_html_string(self) -> None: input_html = 'xyzabcabc' expected_html = ( '' - 'xyzabcabc') + 'xyzabcabc') output_html = p.translate_html_string(input_html) self.assertEqual(output_html, expected_html, 'Should skip some specific tags.')