diff --git a/budoux/skip_nodes.json b/budoux/skip_nodes.json index e42224f0..782f51d9 100644 --- a/budoux/skip_nodes.json +++ b/budoux/skip_nodes.json @@ -5,6 +5,7 @@ "IFRAME", "INPUT", "META", + "NOBR", "SCRIPT", "STYLE", "TEXTAREA", diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index 6818e263..9a8fca27 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Locale; import java.util.Set; +import java.util.Stack; import java.util.stream.Collectors; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -61,6 +62,7 @@ private static class PhraseResolvingNodeVisitor implements NodeVisitor { private final StringBuilder output = new StringBuilder(); private Integer scanIndex = 0; private boolean toSkip = false; + private Stack elementStack = new Stack(); PhraseResolvingNodeVisitor(List phrases) { this.phrasesJoined = String.join(Character.toString(SEP), phrases); @@ -76,14 +78,20 @@ public void head(Node node, int depth) { return; } if (node instanceof Element) { + elementStack.push(toSkip); String attributesEncoded = node.attributes().asList().stream() .map(attribute -> " " + attribute) .collect(Collectors.joining("")); - output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded)); - if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) { + final String nodeName = node.nodeName(); + if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { + if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) { + output.append(""); + scanIndex++; + } toSkip = true; } + output.append(String.format("<%s%s>", nodeName, attributesEncoded)); } else if (node instanceof TextNode) { String data = ((TextNode) node).getWholeText(); for (int i = 0; i < data.length(); i++) { @@ -105,6 +113,8 @@ public void tail(Node node, int depth) { if (node.nodeName().equals("body") || node instanceof TextNode) { return; } + assert node instanceof Element; + toSkip = elementStack.pop(); output.append(String.format("", node.nodeName())); } } diff --git a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java index 202e630d..65f55055 100644 --- a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java +++ b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java @@ -51,12 +51,23 @@ public void testResolveWithStandardHTMLInput() { @Test public void testResolveWithNodesToSkip() { - List phrases = Arrays.asList("abc", "def"); - String html = "af"; + List phrases = Arrays.asList("abc", "def", "ghi"); + String html = "afghi"; + String result = HTMLProcessor.resolve(phrases, html); + assertEquals( + "afghi", + result); + } + + @Test + public void testResolveWithNodesBreakBeforeSkip() { + List phrases = Arrays.asList("abc", "def", "ghi", "jkl"); + String html = "abcdefghijkl"; String result = HTMLProcessor.resolve(phrases, html); assertEquals( "af", + + " anywhere;\">abcdefghijkl", result); }