Skip to content

Commit

Permalink
Support non-breaking content in java (#248)
Browse files Browse the repository at this point in the history
This patch supports non-breaking content in Java.

In the Java and Python implementations, the "skip" operation still includes
the skipped content in the text passed to the BudouX parser, so no changes to
the parser's input text are needed.

This patch makes the following changes:
1. Add `NOBR` to the list of "skip" elements.
2. Fix "skip" so that it is applied only to the element's descendants. Before
   this patch, all content following a "skip" element was skipped.
3. When there's a phrase boundary right before the "skip" element,
   insert a break before the "skip" element.
  • Loading branch information
kojiishi authored Aug 17, 2023
1 parent 613ba71 commit a448046
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 5 deletions.
1 change: 1 addition & 0 deletions budoux/skip_nodes.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"IFRAME",
"INPUT",
"META",
"NOBR",
"SCRIPT",
"STYLE",
"TEXTAREA",
Expand Down
14 changes: 12 additions & 2 deletions java/src/main/java/com/google/budoux/HTMLProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Stack;
import java.util.stream.Collectors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand Down Expand Up @@ -61,6 +62,7 @@ private static class PhraseResolvingNodeVisitor implements NodeVisitor {
private final StringBuilder output = new StringBuilder();
private Integer scanIndex = 0;
private boolean toSkip = false;
private Stack<Boolean> elementStack = new Stack<Boolean>();

PhraseResolvingNodeVisitor(List<String> phrases) {
this.phrasesJoined = String.join(Character.toString(SEP), phrases);
Expand All @@ -76,14 +78,20 @@ public void head(Node node, int depth) {
return;
}
if (node instanceof Element) {
elementStack.push(toSkip);
String attributesEncoded =
node.attributes().asList().stream()
.map(attribute -> " " + attribute)
.collect(Collectors.joining(""));
output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded));
if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) {
final String nodeName = node.nodeName();
if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) {
output.append("<wbr>");
scanIndex++;
}
toSkip = true;
}
output.append(String.format("<%s%s>", nodeName, attributesEncoded));
} else if (node instanceof TextNode) {
String data = ((TextNode) node).getWholeText();
for (int i = 0; i < data.length(); i++) {
Expand All @@ -105,6 +113,8 @@ public void tail(Node node, int depth) {
if (node.nodeName().equals("body") || node instanceof TextNode) {
return;
}
assert node instanceof Element;
toSkip = elementStack.pop();
output.append(String.format("</%s>", node.nodeName()));
}
}
Expand Down
17 changes: 14 additions & 3 deletions java/src/test/java/com/google/budoux/HTMLProcessorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,23 @@ public void testResolveWithStandardHTMLInput() {

@Test
public void testResolveWithNodesToSkip() {
List<String> phrases = Arrays.asList("abc", "def");
String html = "a<button>bcde</button>f";
List<String> phrases = Arrays.asList("abc", "def", "ghi");
String html = "a<button>bcde</button>fghi";
String result = HTMLProcessor.resolve(phrases, html);
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap:"
+ " anywhere;\">a<button>bcde</button>f<wbr>ghi</span>",
result);
}

@Test
public void testResolveWithNodesBreakBeforeSkip() {
List<String> phrases = Arrays.asList("abc", "def", "ghi", "jkl");
String html = "abc<nobr>defghi</nobr>jkl";
String result = HTMLProcessor.resolve(phrases, html);
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap:"
+ " anywhere;\">a<button>bcde</button>f</span>",
+ " anywhere;\">abc<wbr><nobr>defghi</nobr><wbr>jkl</span>",
result);
}

Expand Down

0 comments on commit a448046

Please sign in to comment.