From a448046c8a3104d489f8d330ddb81f83078dd3e7 Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Thu, 17 Aug 2023 10:27:12 +0900 Subject: [PATCH] Support non-breaking content in java (#248) This patch supports non-breaking content in Java. In the Java and Python implementations, the "skip" operation includes the skipped content in the text passed to the BudouX parser, so no changes to the text for the parser are needed. This patch makes the following changes: 1. Add `NOBR` to the list of "skip" elements. 2. Fix "skip" so that it is applied only to the descendants of the "skip" element. Before this patch, all content following a "skip" element was skipped. 3. When there's a phrase boundary right before the "skip" element, insert a break before the "skip" element. --- budoux/skip_nodes.json | 1 + .../java/com/google/budoux/HTMLProcessor.java | 14 ++++++++++++-- .../com/google/budoux/HTMLProcessorTest.java | 17 ++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/budoux/skip_nodes.json b/budoux/skip_nodes.json index e42224f0..782f51d9 100644 --- a/budoux/skip_nodes.json +++ b/budoux/skip_nodes.json @@ -5,6 +5,7 @@ "IFRAME", "INPUT", "META", + "NOBR", "SCRIPT", "STYLE", "TEXTAREA", diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index 6818e263..9a8fca27 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Locale; import java.util.Set; +import java.util.Stack; import java.util.stream.Collectors; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -61,6 +62,7 @@ private static class PhraseResolvingNodeVisitor implements NodeVisitor { private final StringBuilder output = new StringBuilder(); private Integer scanIndex = 0; private boolean toSkip = false; + private Stack elementStack = new Stack(); PhraseResolvingNodeVisitor(List phrases) { this.phrasesJoined = String.join(Character.toString(SEP), 
phrases); @@ -76,14 +78,20 @@ public void head(Node node, int depth) { return; } if (node instanceof Element) { + elementStack.push(toSkip); String attributesEncoded = node.attributes().asList().stream() .map(attribute -> " " + attribute) .collect(Collectors.joining("")); - output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded)); - if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) { + final String nodeName = node.nodeName(); + if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { + if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) { + output.append(""); + scanIndex++; + } toSkip = true; } + output.append(String.format("<%s%s>", nodeName, attributesEncoded)); } else if (node instanceof TextNode) { String data = ((TextNode) node).getWholeText(); for (int i = 0; i < data.length(); i++) { @@ -105,6 +113,8 @@ public void tail(Node node, int depth) { if (node.nodeName().equals("body") || node instanceof TextNode) { return; } + assert node instanceof Element; + toSkip = elementStack.pop(); output.append(String.format("", node.nodeName())); } } diff --git a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java index 202e630d..65f55055 100644 --- a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java +++ b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java @@ -51,12 +51,23 @@ public void testResolveWithStandardHTMLInput() { @Test public void testResolveWithNodesToSkip() { - List phrases = Arrays.asList("abc", "def"); - String html = "af"; + List phrases = Arrays.asList("abc", "def", "ghi"); + String html = "afghi"; + String result = HTMLProcessor.resolve(phrases, html); + assertEquals( + "afghi", + result); + } + + @Test + public void testResolveWithNodesBreakBeforeSkip() { + List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); + String html = "abcdefghijkl"; String result = HTMLProcessor.resolve(phrases, html); assertEquals( "af", + 
+ " anywhere;\">abcdefghijkl", result); }