Skip to content

Commit

Permalink
Java style fix (#132)
Browse files Browse the repository at this point in the history
* Specify Charset in InputStreamReader
* Make HTMLProcessor non-instantiable
* Use StringBuilder instead of StringBuffer
* Style fix
  • Loading branch information
tushuhei authored Mar 17, 2023
1 parent b068370 commit 5221f63
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 30 deletions.
42 changes: 29 additions & 13 deletions java/src/main/java/com/google/budoux/HTMLProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,17 @@
package com.google.budoux;

import com.google.gson.Gson;
import com.google.gson.JsonIOException;
import com.google.gson.JsonSyntaxException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import org.jsoup.Jsoup;
Expand All @@ -37,45 +42,56 @@ final class HTMLProcessor {
private static final Set<String> skipNodes;
private static final String STYLE = "word-break: keep-all; overflow-wrap: break-word;";

private HTMLProcessor() {}

static {
Gson gson = new Gson();
InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json");
Reader reader = new InputStreamReader(inputStream);
String[] skipNodesStrings = gson.fromJson(reader, String[].class);
skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings));
try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
String[] skipNodesStrings = gson.fromJson(reader, String[].class);
skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings));
} catch (JsonSyntaxException | JsonIOException | IOException e) {
throw new RuntimeException(e);
}
}

private static class PhraseResolvingNodeVisitor implements NodeVisitor {
private static final char SEP = '\uFFFF';
private final String phrasesJoined;
private final StringBuilder output = new StringBuilder();
private Integer scanIndex = 0;
private StringBuffer output = new StringBuffer();
private boolean toSkip = false;

PhraseResolvingNodeVisitor(List<String> phrases) {
this.phrasesJoined = String.join(Character.toString(SEP), phrases);
}

public StringBuffer getOutput() {
public StringBuilder getOutput() {
return output;
}

@Override
public void head(Node node, int depth) {
if (node.nodeName() == "body") return;
if (node.nodeName().equals("body")) {
return;
}
if (node instanceof Element) {
String attributesEncoded =
node.attributes().asList().stream()
.map(attribute -> " " + attribute.toString())
.map(attribute -> " " + attribute)
.collect(Collectors.joining(""));
output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded));
if (skipNodes.contains(node.nodeName().toUpperCase())) toSkip = true;
if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) {
toSkip = true;
}
} else if (node instanceof TextNode) {
String data = ((TextNode) node).getWholeText();
for (int i = 0; i < data.length(); i++) {
char c = data.charAt(i);
if (c != phrasesJoined.charAt(scanIndex)) {
if (!toSkip) output.append("<wbr>");
if (!toSkip) {
output.append("<wbr>");
}
scanIndex++;
}
scanIndex++;
Expand All @@ -86,8 +102,9 @@ public void head(Node node, int depth) {

@Override
public void tail(Node node, int depth) {
if (node.nodeName() == "body") return;
if (node instanceof TextNode) return;
if (node.nodeName().equals("body") || node instanceof TextNode) {
return;
}
output.append(String.format("</%s>", node.nodeName()));
}
}
Expand All @@ -103,8 +120,7 @@ public static String resolve(List<String> phrases, String html) {
Document doc = Jsoup.parseBodyFragment(html);
PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases);
doc.body().traverse(nodeVisitor);
String result = String.format("<span style=\"%s\">%s</span>", STYLE, nodeVisitor.getOutput());
return result;
return String.format("<span style=\"%s\">%s</span>", STYLE, nodeVisitor.getOutput());
}

/**
Expand Down
52 changes: 35 additions & 17 deletions java/src/main/java/com/google/budoux/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.Type;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -93,14 +94,12 @@ public static Parser loadByFileName(String modelFileName) {
Gson gson = new Gson();
Type type = new TypeToken<Map<String, Map<String, Integer>>>() {}.getType();
InputStream inputStream = Parser.class.getResourceAsStream(modelFileName);
try (Reader reader = new InputStreamReader(inputStream, "UTF-8")) {
try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
Map<String, Map<String, Integer>> model = gson.fromJson(reader, type);
Parser parser = new Parser(model);
return parser;
return new Parser(model);
} catch (JsonIOException | JsonSyntaxException | IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
return null;
}

/**
Expand All @@ -123,7 +122,9 @@ private int getScore(String featureKey, String sequence) {
* @return a list of phrases.
*/
public List<String> parse(String sentence) {
if (sentence.equals("")) return new ArrayList<>();
if (sentence.isEmpty()) {
return new ArrayList<>();
}
List<String> result = new ArrayList<>();
result.add(String.valueOf(sentence.charAt(0)));
int totalScore =
Expand All @@ -132,25 +133,42 @@ public List<String> parse(String sentence) {
.sum();
for (int i = 1; i < sentence.length(); i++) {
int score = -totalScore;
if (i - 2 > 0) score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2));
if (i - 1 > 0) score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1));
if (i - 2 > 0) {
score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2));
}
if (i - 1 > 0) {
score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1));
}
score += 2 * this.getScore("UW3", sentence.substring(i - 1, i));
score += 2 * this.getScore("UW4", sentence.substring(i, i + 1));
if (i + 1 < sentence.length())
if (i + 1 < sentence.length()) {
score += 2 * this.getScore("UW5", sentence.substring(i + 1, i + 2));
if (i + 2 < sentence.length())
}
if (i + 2 < sentence.length()) {
score += 2 * this.getScore("UW6", sentence.substring(i + 2, i + 3));
if (i > 1) score += 2 * this.getScore("BW1", sentence.substring(i - 2, i));
}
if (i > 1) {
score += 2 * this.getScore("BW1", sentence.substring(i - 2, i));
}
score += 2 * this.getScore("BW2", sentence.substring(i - 1, i + 1));
if (i + 1 < sentence.length())
if (i + 1 < sentence.length()) {
score += 2 * this.getScore("BW3", sentence.substring(i, i + 2));
if (i - 2 > 0) score += 2 * this.getScore("TW1", sentence.substring(i - 3, i));
if (i - 1 > 0) score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1));
if (i + 1 < sentence.length())
}
if (i - 2 > 0) {
score += 2 * this.getScore("TW1", sentence.substring(i - 3, i));
}
if (i - 1 > 0) {
score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1));
}
if (i + 1 < sentence.length()) {
score += 2 * this.getScore("TW3", sentence.substring(i - 1, i + 2));
if (i + 2 < sentence.length())
}
if (i + 2 < sentence.length()) {
score += 2 * this.getScore("TW4", sentence.substring(i, i + 3));
if (score > 0) result.add("");
}
if (score > 0) {
result.add("");
}
result.set(result.size() - 1, result.get(result.size() - 1) + sentence.charAt(i));
}
return result;
Expand Down

0 comments on commit 5221f63

Please sign in to comment.