diff --git a/budoux/html_processor.py b/budoux/html_processor.py
index 5fb859ca..dc8ba15a 100644
--- a/budoux/html_processor.py
+++ b/budoux/html_processor.py
@@ -15,6 +15,7 @@
import json
import os
+import queue
import typing
from html.parser import HTMLParser
@@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]):
self.chunks_joined = SEP.join(chunks)
self.to_skip = False
self.scan_index = 0
+ self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
attr_pairs = []
@@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
else:
attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
encoded_attrs = ''.join(attr_pairs)
+ self.element_stack.put(self.to_skip)
+ if tag.upper() in SKIP_NODES:
+ if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
+ self.scan_index += 1
+ self.output += ''
+ self.to_skip = True
self.output += '<%s%s>' % (tag, encoded_attrs)
- self.to_skip = tag.upper() in SKIP_NODES
def handle_endtag(self, tag: str) -> None:
self.output += '%s>' % (tag)
- self.to_skip = False
+ self.to_skip = self.element_stack.get(block=False)
def handle_data(self, data: str) -> None:
for char in data:
diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
index 5d769ac6..8963bb94 100644
--- a/tests/test_html_processor.py
+++ b/tests/test_html_processor.py
@@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None:
self.assertEqual(result, expected)
def test_with_nodes_to_skip(self) -> None:
- chunks = ['abc', 'def']
- html = "af"
+ chunks = ['abc', 'def', 'ghi']
+ html = "afghi"
+ result = html_processor.resolve(chunks, html)
+ expected = 'afghi'
+ self.assertEqual(result, expected)
+
+ def test_with_break_before_skip(self) -> None:
+ chunks = ['abc', 'def', 'ghi', 'jkl']
+ html = "abcjkl"
result = html_processor.resolve(chunks, html)
- expected = 'af'
+ expected = 'abcjkl'
self.assertEqual(result, expected)
def test_with_nothing_to_split(self) -> None:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index baa0d239..af13230c 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -70,9 +70,12 @@ def test_translate_html_string(self) -> None:
)
input_html = 'xyzxyzabc'
+ # TODO: Because the content of skip elements is included, this test tries
+ # to break before "alert". We may want to distinguish "skip from the
+ # content" and "skip breaking" in the future.
expected_html = (
''
- 'xyzxyzabc')
+ 'xyzxyzabc')
output_html = p.translate_html_string(input_html)
self.assertEqual(output_html, expected_html,
'Should pass script tags as is.')
@@ -80,7 +83,7 @@ def test_translate_html_string(self) -> None:
input_html = 'xyzabc
abc'
expected_html = (
''
- 'xyzabc
abc')
+ 'xyzabc
abc')
output_html = p.translate_html_string(input_html)
self.assertEqual(output_html, expected_html,
'Should skip some specific tags.')