google · tushuhei · Nov 24, 2021 · Nov 19, 2021 · Nov 19, 2021 · Nov 19, 2021
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ javascript/src/data
 source.txt
 encoded_data.txt
 weights.txt
+
+.vscode/
diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py
@@ -160,4 +160,4 @@ def process(source_filename: str, entries_filename: str):
         f.write('\t'.join(row) + '\n')
       p1 = p2
       p2 = p3
-      p3 = p
+      p3 = p
diff --git a/budoux/main.py b/budoux/main.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BudouX script that provides a command-line interface for users."""
+import argparse
+import json
+import os
+import shutil
+import sys
+import textwrap
+import typing
+
+import pkg_resources
+
+import budoux
+
+__version__ = "0.0.1"
+
+
+def check_file(path: str) -> str:
+  """Check that a file exists at the given path.
+
+  Args:
+      path (str): Model path
+
+  Raises:
+      FileNotFoundError: Raised if no file exists at the given path.
+
+  Returns:
+      str: The same model path, once confirmed to refer to an existing file.
+  """
+  if os.path.isfile(path):
+    return path
+  else:
+    raise FileNotFoundError("'{}' is not found.".format(path))
+
+
+def parse_args(
+    test: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
+  """Parse commandline arguments.
+
+  Args:
+      test (typing.Optional[typing.List[str]], optional): Arguments to parse instead of sys.argv[1:] (for testing). Defaults to None.
+
+  Returns:
+      argparse.Namespace: Parsed data of args.
+  """
+  parser = argparse.ArgumentParser(
+      prog="budoux",
+      formatter_class=(lambda prog: argparse.RawDescriptionHelpFormatter(
+          prog,
+          **{
+              "width": shutil.get_terminal_size(fallback=(120, 50)).columns,
+              "max_help_position": 25,
+          },
+      )),
+      description=textwrap.dedent("""\
+        BudouX is the successor to Budou,
+        the machine learning powered line break organizer tool."""),
+  )
+
+  parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
+  parser.add_argument(
+      "-H",
+      "--html",
+      action="store_true",
+      help="HTML mode",
+  )
+  parser.add_argument(
+      "-m",
+      "--model",
+      metavar="JSON",
+      type=check_file,
+      default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"),
+      help="custom model file path (default: models/ja-knbc.json)",
+  )
+  parser.add_argument(
+      "-d",
+      "--delim",
+      metavar="STR",
+      type=str,
+      default="---",
+      help="output delimiter in TEXT mode (default: '---')",
+  )
+
+  parser.add_argument(
+      "-V",
+      "--version",
+      action="version",
+      version="%(prog)s {}".format(__version__))
+  # argparse reads sys.argv[1:] itself when given None, so pass `test` through
+  # unchanged; a truthiness check would wrongly discard an explicit empty list
+  # and parse the real command line instead.
+  return parser.parse_args(test)
+
+
+def _main() -> None:
+  """Run the CLI: load the model, read the input, and print the result."""
+  args = parse_args()
+  # JSON is UTF-8 by specification; do not depend on the locale's encoding.
+  with open(args.model, "r", encoding="utf-8") as f:
+    model = json.load(f)
+
+  parser = budoux.Parser(model)
+
+  if args.html:
+    # HTML mode hands the whole input (argument or stdin) over in one piece.
+    inputs = sys.stdin.read() if args.text is None else args.text
+    print(parser.translate_html_string(inputs))
+  else:
+    # Text mode parses each line on its own; the delimiter separates results.
+    if args.text is None:
+      inputs = [v.rstrip() for v in sys.stdin.readlines()]
+    else:
+      inputs = [v.rstrip() for v in args.text.splitlines()]
+    res = ["\n".join(chunks) for chunks in map(parser.parse, inputs)]
+    print(("\n" + args.delim + "\n").join(res))
+
+
+def main() -> None:
+  """Console entry point; Ctrl-C ends the program quietly with status 0."""
+  try:
+    _main()
+  except KeyboardInterrupt:
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -174,4 +174,4 @@ def load_default_japanese_parser():
   """
   with open(os.path.join(MODEL_DIR, 'ja-knbc.json')) as f:
     model = json.load(f)
-  return Parser(model)
+  return Parser(model)
diff --git a/budoux/utils.py b/budoux/utils.py
@@ -23,4 +23,4 @@ class Result(Enum):
   """An enum to represent the type of inference result."""
   UNKNOWN = 'U'
   POSITIVE = 'B'
-  NEGATIVE = 'O'
+  NEGATIVE = 'O'
diff --git a/scripts/build_model.py b/scripts/build_model.py
@@ -23,39 +23,41 @@
 
 
 def rollup(weights_filename: str, model_filename: str, scale: int = 1000):
-  """Rolls up the weights and outputs a model in JSON with integer scores.
+    """Rolls up the weights and outputs a model in JSON with integer scores.
 
   Args:
     weights_filename (str): A file path for the input weights file.
     model_filename (str): A file path for the output model file.
     scale (int, optional): A scale factor for the output score.
   """
-  decision_trees: typing.Dict[str, float] = dict()
-  with open(weights_filename) as f:
-    for row in f:
-      feature = row.split('\t')[0]
-      score = float(row.split('\t')[1])
-      decision_trees.setdefault(feature, 0)
-      decision_trees[feature] += score
-  with open(model_filename, 'w') as f:
-    decision_trees_intscore = dict(
-      (item[0], int(item[1] * scale)) for item in decision_trees.items())
-    json.dump(decision_trees_intscore, f)
+    decision_trees: typing.Dict[str, float] = dict()
+    with open(weights_filename) as f:
+        for row in f:
+            feature = row.split('\t')[0]
+            score = float(row.split('\t')[1])
+            decision_trees.setdefault(feature, 0)
+            decision_trees[feature] += score
+    with open(model_filename, 'w') as f:
+        decision_trees_intscore = dict(
+            (item[0], int(item[1] * scale)) for item in decision_trees.items())
+        json.dump(decision_trees_intscore, f)
 
 
 def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument('weight_file',
-    help='A file path for the learned weights.')
-  parser.add_argument('-o', '--outfile',
-    help='A file path to export a model file. (default: model.json)',
-    default='model.json')
-  args = parser.parse_args()
-  weights_filename = args.weight_file
-  model_filename = args.outfile
-  rollup(weights_filename, model_filename)
-  print('Model file is exported as', model_filename)
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('weight_file',
+                        help='A file path for the learned weights.')
+    parser.add_argument(
+        '-o',
+        '--outfile',
+        help='A file path to export a model file. (default: model.json)',
+        default='model.json')
+    args = parser.parse_args()
+    weights_filename = args.weight_file
+    model_filename = args.outfile
+    rollup(weights_filename, model_filename)
+    print('Model file is exported as', model_filename)
 
 
 if __name__ == '__main__':
-  main()
+    main()
diff --git a/scripts/context.py b/scripts/context.py
@@ -14,6 +14,7 @@
 
 import os
 import sys
+
 LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
 sys.path.insert(0, os.path.abspath(LIB_PATH))
 from budoux import feature_extractor

diff --git a/scripts/encode_data.py b/scripts/encode_data.py
@@ -18,20 +18,22 @@
 
 
 def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument('source_data',
-    help='''File path of the source training data to extract features.''')
-  parser.add_argument('-o', '--outfile',
-    help='''Output file path for the encoded training data.
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        'source_data',
+        help='''File path of the source training data to extract features.''')
+    parser.add_argument('-o',
+                        '--outfile',
+                        help='''Output file path for the encoded training data.
             (default: encoded_data.txt)''',
-    default='encoded_data.txt')
-  args = parser.parse_args()
-  source_filename = args.source_data
-  train_data_filename = args.outfile
-  feature_extractor.process(source_filename, train_data_filename)
-  print('\033[92mEncoded training data is output to: %s\033[0m' % (
-        train_data_filename))
+                        default='encoded_data.txt')
+    args = parser.parse_args()
+    source_filename = args.source_data
+    train_data_filename = args.outfile
+    feature_extractor.process(source_filename, train_data_filename)
+    print('\033[92mEncoded training data is output to: %s\033[0m' %
+          (train_data_filename))
 
 
 if __name__ == '__main__':
-  main()
+    main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -160,4 +160,4 @@ def process(source_filename: str, entries_filename: str): @@
             f.write('\t'.join(row) + '\n')
           p1 = p2
           p2 = p3
-          p3 = p
+          p3 = p