Skip to content

Commit

Permalink
[java] Replace wholeText() with NodeVisitor (#369)
Browse files Browse the repository at this point in the history
This patch replaces the `wholeText()` call introduced in #367 with a custom `NodeVisitor` implementation.

Whether `wholeText()` emits `\n` for `<br>` depends on the jsoup version. To ensure that `getText()` always matches what `resolve()` does, this patch switches `getText()` to its own traversal logic.
  • Loading branch information
kojiishi authored Nov 15, 2023
1 parent 465907f commit b84f482
Showing 1 changed file with 32 additions and 5 deletions.
37 changes: 32 additions & 5 deletions java/src/main/java/com/google/budoux/HTMLProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,31 @@ private HTMLProcessor() {}
}
}

/**
 * A {@link NodeVisitor} implementation that concatenates the text of every {@link TextNode} it
 * visits into a single string.
 *
 * <p>Each {@code <br>} element contributes a {@code \n} to the output; all other elements
 * contribute nothing themselves (only their descendant text nodes do).
 */
private static class TextizeNodeVisitor implements NodeVisitor {
  /** Accumulates the text in traversal order. */
  private final StringBuilder output = new StringBuilder();

  /**
   * Returns the text collected so far.
   *
   * @return the concatenated text of all nodes visited up to this point.
   */
  public String getString() {
    return output.toString();
  }

  /**
   * Appends the visited node's contribution to the output.
   *
   * @param node the node being entered.
   * @param depth the depth of the node in the traversal; unused.
   */
  @Override
  public void head(Node node, int depth) {
    if (node instanceof TextNode) {
      // Use the whole (non-normalized) text so that no characters are dropped.
      output.append(((TextNode) node).getWholeText());
    } else if (node instanceof Element && "br".equals(node.nodeName())) {
      // A line break has no text node of its own; represent it as a newline.
      output.append('\n');
    }
  }
}

private static class PhraseResolvingNodeVisitor implements NodeVisitor {
private static final char SEP = '\uFFFF';
private final String phrasesJoined;
Expand Down Expand Up @@ -97,12 +122,11 @@ public void head(Node node, int depth) {
.map(attribute -> " " + attribute)
.collect(Collectors.joining(""));
final String nodeName = node.nodeName();
final String upperNodeName = nodeName.toUpperCase(Locale.ENGLISH);
if (upperNodeName.equals("BR")) {
// Match jsoup `Element.wholeText()` returning `\n` for `<br>`.
if (nodeName.equals("br")) {
// `<br>` is converted to `\n`, see `TextizeNodeVisitor.head`.
// Assume phrasesJoined.charAt(scanIndex) == '\n'.
scanIndex++;
} else if (skipNodes.contains(upperNodeName)) {
} else if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) {
output.append(separator);
scanIndex++;
Expand Down Expand Up @@ -175,6 +199,9 @@ public static String resolve(List<String> phrases, String html, String separator
* @return the text content.
*/
public static String getText(String html) {
return Jsoup.parseBodyFragment(html).wholeText();
Document doc = Jsoup.parseBodyFragment(html);
TextizeNodeVisitor nodeVisitor = new TextizeNodeVisitor();
doc.body().traverse(nodeVisitor);
return nodeVisitor.getString();
}
}

0 comments on commit b84f482

Please sign in to comment.