From 5221f6309bfa77362fa68614407f5d9c02b8dd9e Mon Sep 17 00:00:00 2001 From: Shuhei Iitsuka Date: Fri, 17 Mar 2023 15:15:21 +0900 Subject: [PATCH] Java style fix (#132) * Specify Charset in InputStreamReader * Make HTMLProcessor non instantiable * Use StringBuilder instead of StringBuffer * Style fix --- .../java/com/google/budoux/HTMLProcessor.java | 42 ++++++++++----- .../main/java/com/google/budoux/Parser.java | 52 +++++++++++++------ 2 files changed, 64 insertions(+), 30 deletions(-) diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java index 71999cf1..6fdc8b27 100644 --- a/java/src/main/java/com/google/budoux/HTMLProcessor.java +++ b/java/src/main/java/com/google/budoux/HTMLProcessor.java @@ -17,12 +17,17 @@ package com.google.budoux; import com.google.gson.Gson; +import com.google.gson.JsonIOException; +import com.google.gson.JsonSyntaxException; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Set; import java.util.stream.Collectors; import org.jsoup.Jsoup; @@ -37,45 +42,56 @@ final class HTMLProcessor { private static final Set skipNodes; private static final String STYLE = "word-break: keep-all; overflow-wrap: break-word;"; + private HTMLProcessor() {} + static { Gson gson = new Gson(); InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json"); - Reader reader = new InputStreamReader(inputStream); - String[] skipNodesStrings = gson.fromJson(reader, String[].class); - skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings)); + try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { + String[] skipNodesStrings = gson.fromJson(reader, String[].class); + skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings)); + } catch (JsonSyntaxException | JsonIOException | IOException e) { + throw new RuntimeException(e); + } } private static class PhraseResolvingNodeVisitor implements NodeVisitor { private static final char SEP = '\uFFFF'; private final String phrasesJoined; + private final StringBuilder output = new StringBuilder(); private Integer scanIndex = 0; - private StringBuffer output = new StringBuffer(); private boolean toSkip = false; PhraseResolvingNodeVisitor(List phrases) { this.phrasesJoined = String.join(Character.toString(SEP), phrases); } - public StringBuffer getOutput() { + public StringBuilder getOutput() { return output; } @Override public void head(Node node, int depth) { - if (node.nodeName() == "body") return; + if (node.nodeName().equals("body")) { + return; + } if (node instanceof Element) { String attributesEncoded = node.attributes().asList().stream() - .map(attribute -> " " + attribute.toString()) + .map(attribute -> " " + attribute) .collect(Collectors.joining("")); output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded)); - if (skipNodes.contains(node.nodeName().toUpperCase())) toSkip = true; + if (skipNodes.contains(node.nodeName().toUpperCase(Locale.ENGLISH))) { + toSkip = true; + } } else if (node instanceof TextNode) { String data = ((TextNode) node).getWholeText(); for (int i = 0; i < data.length(); i++) { char c = data.charAt(i); if (c != phrasesJoined.charAt(scanIndex)) { - if (!toSkip) output.append(""); + if (!toSkip) { + output.append(""); + } scanIndex++; } scanIndex++; @@ -86,8 +102,9 @@ public void head(Node node, int depth) { @Override public void tail(Node node, int depth) { - if (node.nodeName() == "body") return; - if (node instanceof TextNode) return; + if (node.nodeName().equals("body") || node instanceof TextNode) { + return; + } output.append(String.format("", node.nodeName())); } } @@ -103,8 +120,7 @@ public static String resolve(List phrases, String html) { Document doc = Jsoup.parseBodyFragment(html); PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases); doc.body().traverse(nodeVisitor); - String result = String.format("%s", STYLE, nodeVisitor.getOutput()); - return result; + return String.format("%s", STYLE, nodeVisitor.getOutput()); } /** diff --git a/java/src/main/java/com/google/budoux/Parser.java b/java/src/main/java/com/google/budoux/Parser.java index 355c3f92..383f45a9 100644 --- a/java/src/main/java/com/google/budoux/Parser.java +++ b/java/src/main/java/com/google/budoux/Parser.java @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.io.Reader; import java.lang.reflect.Type; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -93,14 +94,12 @@ public static Parser loadByFileName(String modelFileName) { Gson gson = new Gson(); Type type = new TypeToken>>() {}.getType(); InputStream inputStream = Parser.class.getResourceAsStream(modelFileName); - try (Reader reader = new InputStreamReader(inputStream, "UTF-8")) { + try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { Map> model = gson.fromJson(reader, type); - Parser parser = new Parser(model); - return parser; + return new Parser(model); } catch (JsonIOException | JsonSyntaxException | IOException e) { - e.printStackTrace(); + throw new RuntimeException(e); } - return null; } /** @@ -123,7 +122,9 @@ private int getScore(String featureKey, String sequence) { * @return a list of phrases. */ public List parse(String sentence) { - if (sentence.equals("")) return new ArrayList<>(); + if (sentence.isEmpty()) { + return new ArrayList<>(); + } List result = new ArrayList<>(); result.add(String.valueOf(sentence.charAt(0))); int totalScore = @@ -132,25 +133,42 @@ public List parse(String sentence) { .sum(); for (int i = 1; i < sentence.length(); i++) { int score = -totalScore; - if (i - 2 > 0) score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2)); - if (i - 1 > 0) score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1)); + if (i - 2 > 0) { + score += 2 * this.getScore("UW1", sentence.substring(i - 3, i - 2)); + } + if (i - 1 > 0) { + score += 2 * this.getScore("UW2", sentence.substring(i - 2, i - 1)); + } score += 2 * this.getScore("UW3", sentence.substring(i - 1, i)); score += 2 * this.getScore("UW4", sentence.substring(i, i + 1)); - if (i + 1 < sentence.length()) + if (i + 1 < sentence.length()) { score += 2 * this.getScore("UW5", sentence.substring(i + 1, i + 2)); - if (i + 2 < sentence.length()) + } + if (i + 2 < sentence.length()) { score += 2 * this.getScore("UW6", sentence.substring(i + 2, i + 3)); - if (i > 1) score += 2 * this.getScore("BW1", sentence.substring(i - 2, i)); + } + if (i > 1) { + score += 2 * this.getScore("BW1", sentence.substring(i - 2, i)); + } score += 2 * this.getScore("BW2", sentence.substring(i - 1, i + 1)); - if (i + 1 < sentence.length()) + if (i + 1 < sentence.length()) { score += 2 * this.getScore("BW3", sentence.substring(i, i + 2)); - if (i - 2 > 0) score += 2 * this.getScore("TW1", sentence.substring(i - 3, i)); - if (i - 1 > 0) score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1)); - if (i + 1 < sentence.length()) + } + if (i - 2 > 0) { + score += 2 * this.getScore("TW1", sentence.substring(i - 3, i)); + } + if (i - 1 > 0) { + score += 2 * this.getScore("TW2", sentence.substring(i - 2, i + 1)); + } + if (i + 1 < sentence.length()) { score += 2 * this.getScore("TW3", sentence.substring(i - 1, i + 2)); - if (i + 2 < sentence.length()) + } + if (i + 2 < sentence.length()) { score += 2 * this.getScore("TW4", sentence.substring(i, i + 3)); - if (score > 0) result.add(""); + } + if (score > 0) { + result.add(""); + } result.set(result.size() - 1, result.get(result.size() - 1) + sentence.charAt(i)); } return result;