diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java
index b2dfeddf..6029c29e 100644
--- a/java/src/main/java/com/google/budoux/HTMLProcessor.java
+++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java
@@ -56,6 +56,31 @@ private HTMLProcessor() {}
}
}
+  /**
+   * Concatenates the text of every {@code TextNode} passed to {@code head} into one string. A
+   * {@code <br>} element contributes {@code "\n"}, matching jsoup {@code Element.wholeText()}.
+   */
+  private static class TextizeNodeVisitor implements NodeVisitor {
+    private final StringBuilder output = new StringBuilder();
+
+    /** Returns the text accumulated by the {@code head} calls made so far. */
+    public String getString() {
+      return output.toString();
+    }
+
+    @Override
+    public void head(Node node, int depth) {
+      if (node instanceof TextNode) {
+        output.append(((TextNode) node).getWholeText());
+      } else if (node instanceof Element) {
+        // Upper-case the tag name so the comparison with "BR" ignores case.
+        if (node.nodeName().toUpperCase(Locale.ENGLISH).equals("BR")) {
+          output.append('\n');
+        }
+      }
+    }
+  }
+
private static class PhraseResolvingNodeVisitor implements NodeVisitor {
private static final char SEP = '\uFFFF';
private final String phrasesJoined;
@@ -99,7 +124,7 @@ public void head(Node node, int depth) {
final String nodeName = node.nodeName();
final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH);
if (upperNodeName.equals("BR")) {
- // Match jsoup `Element.wholeText()` returning `\n` for `<br>`.
+ // `<br>` is converted to `\n`, see `TextizeNodeVisitor.head`.
// Assume phrasesJoined.charAt(scanIndex) == '\n'.
scanIndex++;
} else if (skipNodes.contains(upperNodeName)) {
@@ -175,6 +200,9 @@ public static String resolve(List phrases, String html, String separator
* @return the text content.
*/
public static String getText(String html) {
-    return Jsoup.parseBodyFragment(html).wholeText();
+    final Document parsed = Jsoup.parseBodyFragment(html);
+    final TextizeNodeVisitor collector = new TextizeNodeVisitor();
+    parsed.body().traverse(collector);
+    return collector.getString();
}
}