Skip to content

Commit

Permalink
Support non-breaking content in Python (#251)
Browse files Browse the repository at this point in the history
This patch supports non-breaking content in Python.

In the Java and Python implementations, the "Skip" operation still passes
the skipped content to the BudouX parser, so no changes to the text for the
parser are needed.

This patch makes the following changes:
1. Save `to_skip` on a stack of elements and restore it at the end of an
   element, rather than always resetting it to `False`.
2. When there's a phrase boundary right before the "skip" element,
   insert a break before the "skip" element.

Note that `<NOBR>` is added to `skip_nodes.json` in:
#248.
  • Loading branch information
kojiishi authored Aug 17, 2023
1 parent a448046 commit a38f629
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
11 changes: 9 additions & 2 deletions budoux/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import json
import os
import queue
import typing
from html.parser import HTMLParser

Expand Down Expand Up @@ -58,6 +59,7 @@ def __init__(self, chunks: typing.List[str]):
self.chunks_joined = SEP.join(chunks)
self.to_skip = False
self.scan_index = 0
self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()

def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
attr_pairs = []
Expand All @@ -67,12 +69,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
else:
attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
encoded_attrs = ''.join(attr_pairs)
self.element_stack.put(self.to_skip)
if tag.upper() in SKIP_NODES:
if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
self.scan_index += 1
self.output += '<wbr>'
self.to_skip = True
self.output += '<%s%s>' % (tag, encoded_attrs)
self.to_skip = tag.upper() in SKIP_NODES

def handle_endtag(self, tag: str) -> None:
self.output += '</%s>' % (tag)
self.to_skip = False
self.to_skip = self.element_stack.get(block=False)

def handle_data(self, data: str) -> None:
for char in data:
Expand Down
13 changes: 10 additions & 3 deletions tests/test_html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,17 @@ def test_with_standard_html_input(self) -> None:
self.assertEqual(result, expected)

def test_with_nodes_to_skip(self) -> None:
chunks = ['abc', 'def']
html = "a<button>bcde</button>f"
chunks = ['abc', 'def', 'ghi']
html = "a<button>bcde</button>fghi"
result = html_processor.resolve(chunks, html)
expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f<wbr>ghi</span>'
self.assertEqual(result, expected)

def test_with_break_before_skip(self) -> None:
chunks = ['abc', 'def', 'ghi', 'jkl']
html = "abc<button>defghi</button>jkl"
result = html_processor.resolve(chunks, html)
expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">a<button>bcde</button>f</span>'
expected = '<span style="word-break: keep-all; overflow-wrap: anywhere;">abc<wbr><button>defghi</button><wbr>jkl</span>'
self.assertEqual(result, expected)

def test_with_nothing_to_split(self) -> None:
Expand Down
7 changes: 5 additions & 2 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,20 @@ def test_translate_html_string(self) -> None:
)

input_html = 'xyz<script>alert(1);</script>xyzabc'
# TODO: Because the content of skip elements is included, this test tries
# to break before "alert". We may want to distinguish "skip from the
# content" and "skip breaking" in the future.
expected_html = (
'<span style="word-break: keep-all; overflow-wrap: anywhere;">'
'xyz<script>alert(1);</script>xyz<wbr>abc</span>')
'xyz<wbr><script>alert(1);</script>xyz<wbr>abc</span>')
output_html = p.translate_html_string(input_html)
self.assertEqual(output_html, expected_html,
'Should pass script tags as is.')

input_html = 'xyz<code>abc</code>abc'
expected_html = (
'<span style="word-break: keep-all; overflow-wrap: anywhere;">'
'xyz<code>abc</code><wbr>abc</span>')
'xyz<wbr><code>abc</code><wbr>abc</span>')
output_html = p.translate_html_string(input_html)
self.assertEqual(output_html, expected_html,
'Should skip some specific tags.')
Expand Down

0 comments on commit a38f629

Please sign in to comment.