From f5d1de8f4b555e9bc2af39595b505c4805a08f08 Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Thu, 16 Nov 2023 00:44:04 +0900 Subject: [PATCH] [java] Replace `wholeText()` with `NodeVisitor` This patch replaces `wholeText()` implemented in https://github.com/google/budoux/pull/367 by a subclass of `NodeVisitor`. Whether the `wholeText()` emits `\n` for `<br>
` depends on the jsoup versions. To ensure that `getText()` always matches what `resolve()` does, this patch changes to its own logic. --- .../java/com/google/budoux/HTMLProcessor.java | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index b2dfeddf..6029c29e 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -56,6 +56,31 @@ private HTMLProcessor() {} } } + /** + * Concatenate all `TextNode`s to a string. Also convert `<br>
+ * ` to `\n`, matching jsoup `Element.wholeText()`. + */ + private static class TextizeNodeVisitor implements NodeVisitor { + private StringBuilder output = new StringBuilder(); + + public String getString() { + return output.toString(); + } + + @Override + public void head(Node node, int depth) { + if (node instanceof Element) { + final String nodeName = node.nodeName(); + final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH); + if (upperNodeName.equals("BR")) { + output.append('\n'); + } + } else if (node instanceof TextNode) { + output.append(((TextNode) node).getWholeText()); + } + } + } + private static class PhraseResolvingNodeVisitor implements NodeVisitor { private static final char SEP = '\uFFFF'; private final String phrasesJoined; @@ -99,7 +124,7 @@ public void head(Node node, int depth) { final String nodeName = node.nodeName(); final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH); if (upperNodeName.equals("BR")) { - // Match jsoup `Element.wholeText()` returning `\n` for `<br>
`. + // `<br>
` is converted to `\n`, see `TextizeNodeVisitor.head`. // Assume phrasesJoined.charAt(scanIndex) == '\n'. scanIndex++; } else if (skipNodes.contains(upperNodeName)) { @@ -175,6 +200,9 @@ public static String resolve(List<String> phrases, String html, String separator * @return the text content. */ public static String getText(String html) { - return Jsoup.parseBodyFragment(html).wholeText(); + Document doc = Jsoup.parseBodyFragment(html); + TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor(); + doc.body().traverse(nodeVisitor); + return nodeVisitor.getString(); } }