Support non-breaking content in Python (#251)

This patch supports non-breaking content in Python. In the Java and Python implementations, the "Skip" operation still includes the skipped content in the text passed to the BudouX parser, so no changes to the text for the parser are needed. This patch changes the following items: 1. The value of `to_skip` is now saved on a stack of elements and restored at the end of each element, rather than always being reset to `False` at the end of an element. 2. When there's a phrase boundary right before the "skip" element, a break is inserted before the "skip" element. Note that `<NOBR>` was added to `skip_nodes.json` in #248.
google · Aug 17, 2023 · a38f629 · a38f629
1 parent a448046
commit a38f629
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 7 deletions.
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
@@ -15,6 +15,7 @@
 
 import json
 import os
+import queue
 import typing
 from html.parser import HTMLParser
 
@@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]):
     self.chunks_joined = SEP.join(chunks)
     self.to_skip = False
     self.scan_index = 0
+    self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
 
   def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
     attr_pairs = []
@@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
       else:
         attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
     encoded_attrs = ''.join(attr_pairs)
+    self.element_stack.put(self.to_skip)
+    if tag.upper() in SKIP_NODES:
+      if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
+        self.scan_index += 1
+        self.output += '<wbr>'
+      self.to_skip = True
     self.output += '<%s%s>' % (tag, encoded_attrs)
-    self.to_skip = tag.upper() in SKIP_NODES
 
   def handle_endtag(self, tag: str) -> None:
     self.output += '</%s>' % (tag)
-    self.to_skip = False
+    self.to_skip = self.element_stack.get(block=False)
 
   def handle_data(self, data: str) -> None:
     for char in data:

diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
@@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None:
     self.assertEqual(result, expected)
 
   def test_with_nodes_to_skip(self) -> None:
-    chunks = ['abc', 'def']
-    html = "a<button>bcde</button>f"
+    chunks = ['abc', 'def', 'ghi']
+    html = "a<button>bcde</button>fghi"
+    result = html_processor.resolve(chunks, html)
+    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f<wbr>ghi</span>'
+    self.assertEqual(result, expected)
+
+  def test_with_break_before_skip(self) -> None:
+    chunks = ['abc', 'def', 'ghi', 'jkl']
+    html = "abc<button>defghi</button>jkl"
     result = html_processor.resolve(chunks, html)
-    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f</span>'
+    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">abc<wbr><button>defghi</button><wbr>jkl</span>'
     self.assertEqual(result, expected)
 
   def test_with_nothing_to_split(self) -> None:

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -70,17 +70,20 @@ def test_translate_html_string(self) -> None:
     )
 
     input_html = 'xyz<script>alert(1);</script>xyzabc'
+    # TODO: Because the content of skip elements is included, this test tries
+    # to break before "alert". We may want to distinguish "skip from the
+    # content" and "skip breaking" in the future.
     expected_html = (
         '<span style="word-break: keep-all; overflow-wrap: anywhere;">'
-        'xyz<script>alert(1);</script>xyz<wbr>abc</span>')
+        'xyz<wbr><script>alert(1);</script>xyz<wbr>abc</span>')
     output_html = p.translate_html_string(input_html)
     self.assertEqual(output_html, expected_html,
                      'Should pass script tags as is.')
 
     input_html = 'xyz<code>abc</code>abc'
     expected_html = (
         '<span style="word-break: keep-all; overflow-wrap: anywhere;">'
-        'xyz<code>abc</code><wbr>abc</span>')
+        'xyz<wbr><code>abc</code><wbr>abc</span>')
     output_html = p.translate_html_string(input_html)
     self.assertEqual(output_html, expected_html,
                      'Should skip some specific tags.')