diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java
index b2dfeddf..6029c29e 100644
--- a/java/src/main/java/com/google/budoux/HTMLProcessor.java
+++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java
@@ -56,6 +56,31 @@ private HTMLProcessor() {}
     }
   }
 
+  /**
+   * Concatenates the text of all {@code TextNode}s, converting {@code <br>} elements to
+   * {@code \n} to match jsoup {@code Element.wholeText()}.
+   */
+  private static class TextizeNodeVisitor implements NodeVisitor {
+    private final StringBuilder output = new StringBuilder();
+
+    public String getString() {
+      return output.toString();
+    }
+
+    @Override
+    public void head(Node node, int depth) {
+      if (node instanceof Element) {
+        final String nodeName = node.nodeName();
+        final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH);
+        if (upperNodeName.equals("BR")) {
+          output.append('\n');
+        }
+      } else if (node instanceof TextNode) {
+        output.append(((TextNode) node).getWholeText());
+      }
+    }
+  }
+
   private static class PhraseResolvingNodeVisitor implements NodeVisitor {
     private static final char SEP = '\uFFFF';
     private final String phrasesJoined;
@@ -99,7 +124,7 @@ public void head(Node node, int depth) {
         final String nodeName = node.nodeName();
         final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH);
         if (upperNodeName.equals("BR")) {
-          // Match jsoup `Element.wholeText()` returning `\n` for `<br>`.
+          // `<br>` is converted to `\n`, see `TextizeNodeVisitor.head`.
           // Assume phrasesJoined.charAt(scanIndex) == '\n'.
           scanIndex++;
         } else if (skipNodes.contains(upperNodeName)) {
@@ -175,6 +200,9 @@ public static String resolve(List<String> phrases, String html, String separator
    * @return the text content.
    */
   public static String getText(String html) {
-    return Jsoup.parseBodyFragment(html).wholeText();
+    Document doc = Jsoup.parseBodyFragment(html);
+    TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor();
+    doc.body().traverse(nodeVisitor);
+    return nodeVisitor.getString();
   }
 }