From b84f4820388e98a4149579729376ff893d317050 Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Thu, 16 Nov 2023 07:06:08 +0900 Subject: [PATCH] [java] Replace `wholeText()` with `NodeVisitor` (#369) This patch replaces `wholeText()` implemented in https://github.com/google/budoux/pull/367 by a subclass of `NodeVisitor`. Whether the `wholeText()` emits `\n` for `<br>` depends on the jsoup versions. To ensure that `getText()` always matches what `resolve()` does, this patch changes to its own logic. --- .../java/com/google/budoux/HTMLProcessor.java | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index b2dfeddf..0337da58 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -56,6 +56,31 @@ private HTMLProcessor() {} } } + /** + * A `NodeVisitor` subclass that concatenates all `TextNode`s to a string. + * + *

<p>It also converts `<br>` to `\n`. + */ + private static class TextizeNodeVisitor implements NodeVisitor { + private StringBuilder output = new StringBuilder(); + + public String getString() { + return output.toString(); + } + + @Override + public void head(Node node, int depth) { + if (node instanceof Element) { + final String nodeName = node.nodeName(); + if (nodeName.equals("br")) { + output.append('\n'); + } + } else if (node instanceof TextNode) { + output.append(((TextNode) node).getWholeText()); + } + } + } + private static class PhraseResolvingNodeVisitor implements NodeVisitor { private static final char SEP = '\uFFFF'; private final String phrasesJoined; @@ -97,12 +122,11 @@ public void head(Node node, int depth) { .map(attribute -> " " + attribute) .collect(Collectors.joining("")); final String nodeName = node.nodeName(); - final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH); - if (upperNodeName.equals("BR")) { - // Match jsoup `Element.wholeText()` returning `\n` for `<br>`. + if (nodeName.equals("br")) { + // `<br>` is converted to `\n`, see `TextizeNodeVisitor.head`. // Assume phrasesJoined.charAt(scanIndex) == '\n'. scanIndex++; - } else if (skipNodes.contains(upperNodeName)) { + } else if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) { if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) { output.append(separator); scanIndex++; @@ -175,6 +199,9 @@ public static String resolve(List<String> phrases, String html, String separator * @return the text content. */ public static String getText(String html) { - return Jsoup.parseBodyFragment(html).wholeText(); + Document doc = Jsoup.parseBodyFragment(html); + TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor(); + doc.body().traverse(nodeVisitor); + return nodeVisitor.getString(); } }