From a38f629dc354fa76860f7124f44843dff3a08137 Mon Sep 17 00:00:00 2001
From: Koji Ishii <kojii@chromium.org>
Date: Thu, 17 Aug 2023 10:27:26 +0900
Subject: [PATCH] Support non-breaking content in Python (#251)

This patch supports non-breaking content in Python.

In the Java and Python implementations, the "Skip" operation includes
the skipped content in the text given to the BudouX parser, so no
changes to the text for the parser are needed.

This patch makes the following changes:
1. Save `to_skip` on a stack of elements and restore it at the end of
   each element, rather than always resetting it to `False`.
2. When there's a phrase boundary right before the "skip" element,
   insert a break before the "skip" element.

Note that `<NOBR>` is added to `skip_nodes.json` in:
https://github.com/google/budoux/pull/248.
---
 budoux/html_processor.py     | 11 +++++++++--
 tests/test_html_processor.py | 13 ++++++++++---
 tests/test_parser.py         |  7 +++++--
 3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
index 5fb859ca..dc8ba15a 100644
--- a/budoux/html_processor.py
+++ b/budoux/html_processor.py
@@ -15,6 +15,7 @@
 
 import json
 import os
+import queue
 import typing
 from html.parser import HTMLParser
 
@@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]):
     self.chunks_joined = SEP.join(chunks)
     self.to_skip = False
     self.scan_index = 0
+    self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
 
   def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
     attr_pairs = []
@@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
       else:
         attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
     encoded_attrs = ''.join(attr_pairs)
+    self.element_stack.put(self.to_skip)
+    if tag.upper() in SKIP_NODES:
+      if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
+        self.scan_index += 1
+        self.output += '<wbr>'
+      self.to_skip = True
     self.output += '<%s%s>' % (tag, encoded_attrs)
-    self.to_skip = tag.upper() in SKIP_NODES
 
   def handle_endtag(self, tag: str) -> None:
     self.output += '</%s>' % (tag)
-    self.to_skip = False
+    self.to_skip = self.element_stack.get(block=False)
 
   def handle_data(self, data: str) -> None:
     for char in data:
diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
index 5d769ac6..8963bb94 100644
--- a/tests/test_html_processor.py
+++ b/tests/test_html_processor.py
@@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None:
     self.assertEqual(result, expected)
 
   def test_with_nodes_to_skip(self) -> None:
-    chunks = ['abc', 'def']
-    html = "a<button>bcde</button>f"
+    chunks = ['abc', 'def', 'ghi']
+    html = "a<button>bcde</button>fghi"
+    result = html_processor.resolve(chunks, html)
+    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f<wbr>ghi</span>'
+    self.assertEqual(result, expected)
+
+  def test_with_break_before_skip(self) -> None:
+    chunks = ['abc', 'def', 'ghi', 'jkl']
+    html = "abc<button>defghi</button>jkl"
     result = html_processor.resolve(chunks, html)
-    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f</span>'
+    expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">abc<wbr><button>defghi</button><wbr>jkl</span>'
     self.assertEqual(result, expected)
 
   def test_with_nothing_to_split(self) -> None:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index baa0d239..af13230c 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -70,9 +70,12 @@ def test_translate_html_string(self) -> None:
     )
 
     input_html = 'xyz<script>alert(1);</script>xyzabc'
+    # TODO: Because the content of skip elements is included, this test tries
+    # to break before "alert". We may want to distinguish "skip from the
+    # content" and "skip breaking" in the future.
     expected_html = (
         '<span style="word-break: keep-all; overflow-wrap: anywhere;">'
-        'xyz<script>alert(1);</script>xyz<wbr>abc</span>')
+        'xyz<wbr><script>alert(1);</script>xyz<wbr>abc</span>')
     output_html = p.translate_html_string(input_html)
     self.assertEqual(output_html, expected_html,
                      'Should pass script tags as is.')
@@ -80,7 +83,7 @@ def test_translate_html_string(self) -> None:
     input_html = 'xyz<code>abc</code>abc'
     expected_html = (
         '<span style="word-break: keep-all; overflow-wrap: anywhere;">'
-        'xyz<code>abc</code><wbr>abc</span>')
+        'xyz<wbr><code>abc</code><wbr>abc</span>')
     output_html = p.translate_html_string(input_html)
     self.assertEqual(output_html, expected_html,
                      'Should skip some specific tags.')