google · tushuhei · Jul 27, 2022 · Jul 26, 2022 · Jul 13, 2022 · Jul 27, 2022
diff --git a/README.md b/README.md
@@ -122,7 +122,7 @@ If you want to see help, run `budoux -h`.
 
 ```shellsession
 $ budoux -h
-usage: budoux [-h] [-H] [-m JSON | -l LANG] [-d STR] [-t THRES] [-V] [TXT]
+usage: budoux [-h] [-H] [-m JSON | -l LANG] [-d STR] [-V] [TXT]
 
 BudouX is the successor to Budou,
 the machine learning powered line break organizer tool.
@@ -136,7 +136,6 @@ options:
   -m JSON, --model JSON    custom model file path (default: /path/to/models/ja-knbc.json)
   -l LANG, --lang LANG     language of custom model (default: None)
   -d STR, --delim STR      output delimiter in TEXT mode (default: ---)
-  -t THRES, --thres THRES  threshold value to separate chunks (default: 1000)
   -V, --version            show program's version number and exit
 
 supported languages of `-l`, `--lang`:

diff --git a/budoux/__init__.py b/budoux/__init__.py
@@ -20,4 +20,3 @@
 Parser = parser.Parser
 load_default_japanese_parser = parser.load_default_japanese_parser
 load_default_simplified_chinese_parser = parser.load_default_simplified_chinese_parser
-DEFAULT_THRES = parser.DEFAULT_THRES
diff --git a/budoux/main.py b/budoux/main.py
@@ -147,13 +147,6 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
       default="---",
       help="output delimiter in TEXT mode",
   )
-  parser.add_argument(
-      "-t",
-      "--thres",
-      type=int,
-      default=budoux.DEFAULT_THRES,
-      help="threshold value to separate chunks",
-  )
   parser.add_argument(
       "-V",
       "--version",
@@ -178,13 +171,13 @@ def _main(test: ArgList = None) -> str:
       inputs_html = sys.stdin.read()
     else:
       inputs_html = args.text
-    res = parser.translate_html_string(inputs_html, thres=args.thres)
+    res = parser.translate_html_string(inputs_html)
   else:
     if args.text is None:
       inputs = [v.rstrip() for v in sys.stdin.readlines()]
     else:
       inputs = [v.rstrip() for v in args.text.splitlines()]
-    outputs = [parser.parse(sentence, thres=args.thres) for sentence in inputs]
+    outputs = [parser.parse(sentence) for sentence in inputs]
     conbined_output = ["\n".join(output) for output in outputs]
     ors = "\n" + args.delim + "\n"
     res = ors.join(conbined_output)

diff --git a/budoux/models/ja-knbc.json b/budoux/models/ja-knbc.json
diff --git a/budoux/models/zh-hans.json b/budoux/models/zh-hans.json
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -23,7 +23,6 @@
 
 MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
 PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: break-word;'
-DEFAULT_THRES = 1000
 with open(os.path.join(os.path.dirname(__file__), 'skip_nodes.json')) as f:
   SKIP_NODES: typing.Set[str] = set(json.load(f))
 
@@ -111,14 +110,11 @@ def __init__(self, model: typing.Dict[str, int]):
     """
     self.model = model
 
-  def parse(self,
-            sentence: str,
-            thres: int = DEFAULT_THRES) -> typing.List[str]:
+  def parse(self, sentence: str) -> typing.List[str]:
     """Parses the input sentence and returns a list of semantic chunks.
 
     Args:
       sentence (str): An input sentence.
-      thres (int, optional): A score to control the granularity of chunks.
 
     Returns:
       A list of semantic chunks (List[str]).
@@ -129,18 +125,16 @@ def parse(self,
     p2 = Result.UNKNOWN.value
     p3 = Result.UNKNOWN.value
     chunks = [sentence[0]]
+    base_score = -sum(self.model.values())
     for i in range(1, len(sentence)):
       feature = get_feature(
           sentence[i - 3] if i > 2 else INVALID,
           sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], sentence[i],
           sentence[i + 1] if i + 1 < len(sentence) else INVALID,
           sentence[i + 2] if i + 2 < len(sentence) else INVALID, p1, p2, p3)
-      score = 0
-      for f in feature:
-        if f not in self.model:
-          continue
-        score += self.model[f]
-      if score > thres:
+      score = base_score + 2 * sum(
+          self.model[f] for f in feature if f in self.model)
+      if score > 0:
         chunks.append(sentence[i])
       else:
         chunks[-1] += sentence[i]
@@ -150,12 +144,11 @@ def parse(self,
       p3 = p
     return chunks
 
-  def translate_html_string(self, html: str, thres: int = DEFAULT_THRES) -> str:
+  def translate_html_string(self, html: str) -> str:
     """Translates the given HTML string with markups for semantic line breaks.
 
     Args:
       html (str): An input html string.
-      threshold (int, optional): A score to control the granularity of chunks.
 
     Returns:
       The translated HTML string (str).
@@ -164,7 +157,7 @@ def translate_html_string(self, html: str, thres: int = DEFAULT_THRES) -> str:
     text_content_extractor = TextContentExtractor()
     text_content_extractor.feed(html)
     text_content = text_content_extractor.output
-    chunks = self.parse(text_content, thres)
+    chunks = self.parse(text_content)
     resolver = HTMLChunkResolver(chunks)
     resolver.feed(html)
     return '<span style="%s">%s</span>' % (PARENT_CSS_STYLE, resolver.output)

diff --git a/demo/package-lock.json b/demo/package-lock.json
diff --git a/demo/src/app.ts b/demo/src/app.ts
@@ -27,7 +27,6 @@ const defaultInputs = new Map([
 const inputTextElement = document.getElementById('input') as HTMLTextAreaElement;
 const outputContainerElement = document.getElementById('output') as HTMLElement;
 const fontSizeElement = document.getElementById('fontsize') as HTMLInputElement;
-const thresholdElement = document.getElementById('threshold') as HTMLInputElement;
 const brCheckElement = document.getElementById('wbr2br') as HTMLInputElement;
 const modelSelectElement = document.getElementById('model') as HTMLSelectElement;
 const url = new URL(document.location.href);
@@ -46,11 +45,10 @@ declare global {
  */
 const run = () => {
   outputContainerElement.innerHTML = window.DOMPurify.sanitize(inputTextElement.value);
-  const threshold = Number(thresholdElement.value);
   const model = modelSelectElement.value;
   const parser = parsers.get(model);
   if (!parser) return;
-  parser.applyElement(outputContainerElement, threshold);
+  parser.applyElement(outputContainerElement);
   outputContainerElement.style.fontSize = `${fontSizeElement.value}rem`;
   const renderWithBR = brCheckElement.checked;
   if (renderWithBR) {
@@ -78,10 +76,6 @@ inputTextElement.addEventListener('input', () => {
   run();
 });
 
-thresholdElement.addEventListener('input', () => {
-  run();
-});
-
 brCheckElement.addEventListener('input', () => {
   run();
 });

diff --git a/demo/static/index.html b/demo/static/index.html
@@ -80,10 +80,6 @@ <h1>BudouX 🍇</h1>
         <label for="fontsize">Font size</label>
         <input type="range" id="fontsize" min="1" max="8" step="0.1" value="3">
       </div>
-      <div class="form-item">
-        <label for="threshold">Threshold</label>
-        <input type="range" id="threshold" min="100" max="2000" step="10" value="1000">
-      </div>
       <div class="form-item">
         <input type="checkbox" id="wbr2br">
         <label for="wbr2br">Replace WBR with BR</label>

diff --git a/javascript/README.md b/javascript/README.md
@@ -149,7 +149,7 @@ If you want to see help, run `budoux -h`.
 
 ```shellsession
 $ budoux -h
-Usage: budoux [-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT]
+Usage: budoux [-h] [-H] [-d STR] [-m JSON] [-V] [TXT]
 
 BudouX is the successor to Budou, the machine learning powered line break organizer tool.
 
@@ -159,18 +159,11 @@ Arguments:
 Options:
   -H, --html            HTML mode (default: false)
   -d, --delim <str>     output delimiter in TEXT mode (default: "---")
-  -t, --thres <number>  threshold value to separate chunks (default: "1000")
   -m, --model <json>    custom model file path
   -V, --version         output the version number
   -h, --help            display help for command
 ```
 
-### Attributes
-
-- thres
-  - The threshold value to control the granularity of output chunks.
-    Smaller value returns more granular chunks. (default: 1000).
-
 ## Caveat
 
 BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap

diff --git a/javascript/package-lock.json b/javascript/package-lock.json
diff --git a/javascript/src/cli.ts b/javascript/src/cli.ts
@@ -18,7 +18,7 @@ import {readFileSync} from 'fs';
 import {resolve} from 'path';
 import * as readline from 'readline';
 import {Command} from 'commander';
-import {Parser, loadDefaultJapaneseParser, DEFAULT_THRES} from './parser';
+import {Parser, loadDefaultJapaneseParser} from './parser';
 
 /**
  * Run the command line interface program.
@@ -34,11 +34,6 @@ export const cli = (argv: string[]) => {
   program
     .option('-H, --html', 'HTML mode', false)
     .option('-d, --delim <str>', 'output delimiter in TEXT mode', '---')
-    .option(
-      '-t, --thres <number>',
-      'threshold value to separate chunks',
-      `${DEFAULT_THRES}`
-    )
     .option('-m, --model <json>', 'custom model file path')
     .argument('[txt]', 'text');
 
@@ -47,10 +42,9 @@ export const cli = (argv: string[]) => {
   program.parse(argv);
 
   const options = program.opts();
-  const {model, thres, delim, html} = options as {
+  const {model, delim, html} = options as {
     html: boolean;
     delim: string;
-    thres: number;
     model?: string;
   };
   const {args} = program;
@@ -68,12 +62,12 @@ export const cli = (argv: string[]) => {
         stdin += line + '\n';
       });
       process.stdin.on('end', () => {
-        outputParsedTexts(parser, html, delim, thres, [stdin]);
+        outputParsedTexts(parser, html, delim, [stdin]);
       });
       break;
     }
     case 1: {
-      outputParsedTexts(parser, html, delim, thres, args);
+      outputParsedTexts(parser, html, delim, args);
       break;
     }
     default: {
@@ -89,26 +83,24 @@ export const cli = (argv: string[]) => {
  * @param parser A parser.
  * @param html A flag of html output mode.
  * @param delim A delimiter to separate output sentence.
- * @param thres A threshold value to separate chunks.
  * @param args string array to parse. Array should have only one element.
  */
 const outputParsedTexts = (
   parser: Parser,
   html: boolean,
   delim: string,
-  thres: number,
   args: string[]
 ) => {
   if (html) {
     const text = args[0];
-    const output = parser.translateHTMLString(text, thres);
+    const output = parser.translateHTMLString(text);
     console.log(output);
   } else {
     const splitedTextsByNewLine = args[0]
       .split(/\r?\n/)
       .filter(text => text !== '');
     splitedTextsByNewLine.forEach((text, index) => {
-      const parsedTexts = parser.parse(text, thres);
+      const parsedTexts = parser.parse(text);
       parsedTexts.forEach(parsedText => {
         console.log(parsedText);
       });

diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-import {Parser, DEFAULT_THRES} from './parser';
+import {Parser} from './parser';
 
 const assert = console.assert;
 
@@ -220,11 +220,6 @@ export interface HTMLProcessorOptions {
    * The default value is U+200B ZERO WIDTH SPACE.
    */
   separator?: string | Node;
-  /**
-   * The threshold score to control the granularity of chunks.
-   * See {@link Parser.parse}.
-   */
-  threshold?: number;
 }
 
 /**
@@ -245,8 +240,6 @@ export class HTMLProcessor {
   className?: string;
   /** See {@link HTMLProcessorOptions.separator}. */
   separator: string | Node = ZWSP;
-  /** See {@link HTMLProcessorOptions.threshold}. */
-  threshold: number = DEFAULT_THRES;
 
   /**
    * @param parser A BudouX {@link Parser} to compute semantic line breaks.
@@ -256,7 +249,6 @@ export class HTMLProcessor {
     if (options !== undefined) {
       if (options.className !== undefined) this.className = options.className;
       if (options.separator !== undefined) this.separator = options.separator;
-      if (options.threshold !== undefined) this.threshold = options.threshold;
     }
   }
 
@@ -339,7 +331,7 @@ export class HTMLProcessor {
     if (/^\s*$/.test(text)) return;
 
     // Split the text into a list of phrases.
-    const phrases = this.parser_.parse(text, this.threshold);
+    const phrases = this.parser_.parse(text);
     assert(phrases.length > 0);
     assert(
       phrases.reduce((sum, phrase) => sum + phrase.length, 0) === text.length