google · tushuhei · Nov 24, 2021 · Nov 24, 2021
diff --git a/scripts/build_model.py b/scripts/build_model.py
@@ -23,41 +23,41 @@
 
 
 def rollup(weights_filename: str, model_filename: str, scale: int = 1000):
-    """Rolls up the weights and outputs a model in JSON with integer scores.
+  """Rolls up the weights and outputs a model in JSON with integer scores.
 
   Args:
     weights_filename (str): A file path for the input weights file.
     model_filename (str): A file path for the output model file.
     scale (int, optional): A scale factor for the output score.
   """
-    decision_trees: typing.Dict[str, float] = dict()
-    with open(weights_filename) as f:
-        for row in f:
-            feature = row.split('\t')[0]
-            score = float(row.split('\t')[1])
-            decision_trees.setdefault(feature, 0)
-            decision_trees[feature] += score
-    with open(model_filename, 'w') as f:
-        decision_trees_intscore = dict(
-            (item[0], int(item[1] * scale)) for item in decision_trees.items())
-        json.dump(decision_trees_intscore, f)
+  decision_trees: typing.Dict[str, float] = dict()
+  with open(weights_filename) as f:
+    for row in f:
+      feature = row.split('\t')[0]
+      score = float(row.split('\t')[1])
+      decision_trees.setdefault(feature, 0)
+      decision_trees[feature] += score
+  with open(model_filename, 'w') as f:
+    decision_trees_intscore = dict(
+        (item[0], int(item[1] * scale)) for item in decision_trees.items())
+    json.dump(decision_trees_intscore, f)
 
 
 def main():
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument('weight_file',
-                        help='A file path for the learned weights.')
-    parser.add_argument(
-        '-o',
-        '--outfile',
-        help='A file path to export a model file. (default: model.json)',
-        default='model.json')
-    args = parser.parse_args()
-    weights_filename = args.weight_file
-    model_filename = args.outfile
-    rollup(weights_filename, model_filename)
-    print('Model file is exported as', model_filename)
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      'weight_file', help='A file path for the learned weights.')
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='A file path to export a model file. (default: model.json)',
+      default='model.json')
+  args = parser.parse_args()
+  weights_filename = args.weight_file
+  model_filename = args.outfile
+  rollup(weights_filename, model_filename)
+  print('Model file is exported as', model_filename)
 
 
 if __name__ == '__main__':
-    main()
+  main()
diff --git a/scripts/encode_data.py b/scripts/encode_data.py
@@ -18,22 +18,23 @@
 
 
 def main():
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        'source_data',
-        help='''File path of the source training data to extract features.''')
-    parser.add_argument('-o',
-                        '--outfile',
-                        help='''Output file path for the encoded training data.
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      'source_data',
+      help='''File path of the source training data to extract features.''')
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='''Output file path for the encoded training data.
             (default: encoded_data.txt)''',
-                        default='encoded_data.txt')
-    args = parser.parse_args()
-    source_filename = args.source_data
-    train_data_filename = args.outfile
-    feature_extractor.process(source_filename, train_data_filename)
-    print('\033[92mEncoded training data is output to: %s\033[0m' %
-          (train_data_filename))
+      default='encoded_data.txt')
+  args = parser.parse_args()
+  source_filename = args.source_data
+  train_data_filename = args.outfile
+  feature_extractor.process(source_filename, train_data_filename)
+  print('\033[92mEncoded training data is output to: %s\033[0m' %
+        (train_data_filename))
 
 
 if __name__ == '__main__':
-    main()
+  main()
diff --git a/scripts/load_knbc.py b/scripts/load_knbc.py
@@ -27,110 +27,116 @@
 
 
 class KNBCHTMLParser(HTMLParser):
-    """Parses the HTML files in the KNBC corpus and outputs the chunks."""
-    def __init__(self, split_tab: bool = True):
-        super().__init__()
-        self.chunks = ['']
-        self.n_rows = 0
-        self.n_cols = 0
-        self.current_word = None
-        self.split_tab = split_tab
-
-    def handle_starttag(self, tag, _):
-        if tag == 'tr':
-            self.n_rows += 1
-            self.n_cols = 0
-            self.current_word = None
-        if tag == 'td':
-            self.n_cols += 1
-
-    def handle_endtag(self, tag):
-        if tag != 'tr': return
-        if (self.n_rows > 2 and self.n_cols == 1
-                and (self.split_tab or self.current_word == '文節区切り')):
-            self.chunks.append('')
-        if self.n_cols == 5:
-            self.chunks[-1] += self.current_word
-
-    def handle_data(self, data):
-        if self.n_cols == 1:
-            self.current_word = data
+  """Parses the HTML files in the KNBC corpus and outputs the chunks."""
+
+  def __init__(self, split_tab: bool = True):
+    super().__init__()
+    self.chunks = ['']
+    self.n_rows = 0
+    self.n_cols = 0
+    self.current_word = None
+    self.split_tab = split_tab
+
+  def handle_starttag(self, tag, _):
+    if tag == 'tr':
+      self.n_rows += 1
+      self.n_cols = 0
+      self.current_word = None
+    if tag == 'td':
+      self.n_cols += 1
+
+  def handle_endtag(self, tag):
+    if tag != 'tr':
+      return
+    if (self.n_rows > 2 and self.n_cols == 1 and
+        (self.split_tab or self.current_word == '文節区切り')):
+      self.chunks.append('')
+    if self.n_cols == 5:
+      self.chunks[-1] += self.current_word
+
+  def handle_data(self, data):
+    if self.n_cols == 1:
+      self.current_word = data
 
 
 def break_before_open_parentheses(chunks: typing.List[str]):
-    """Adds chunk breaks before every open parentheses.
+  """Adds chunk breaks before every open parentheses.
 
   Args:
     chunks (List[str]): Source chunks.
 
   Returns:
     Processed chunks.
   """
-    out: typing.List[str] = []
-    for chunk in chunks:
-        if '（' in chunk:
-            index = chunk.index('（')
-            if index > 0: out.append(chunk[:index])
-            out.append(chunk[index:])
-        else:
-            out.append(chunk)
-    return out
+  out: typing.List[str] = []
+  for chunk in chunks:
+    if '（' in chunk:
+      index = chunk.index('（')
+      if index > 0:
+        out.append(chunk[:index])
+      out.append(chunk[index:])
+    else:
+      out.append(chunk)
+  return out
 
 
 def postprocess(chunks: typing.List[str]):
-    """Applies some processes to modify the extracted chunks.
+  """Applies some processes to modify the extracted chunks.
 
   Args:
     chunks (List[str]): Source chunks.
 
   Returns:
     Processed chunks.
   """
-    chunks = break_before_open_parentheses(chunks)
-    return chunks
+  chunks = break_before_open_parentheses(chunks)
+  return chunks
 
 
 def download_knbc(target_dir: str):
-    """Downloads the KNBC corpus and extracts files.
+  """Downloads the KNBC corpus and extracts files.
 
   Args:
     target_dir: A path to the directory to expand files.
   """
-    os.makedirs(target_dir, exist_ok=True)
-    download_file_path = os.path.join(target_dir, 'knbc.tar.bz2')
-    try:
-        urllib.request.urlretrieve(RESOURCE_URL, download_file_path)
-    except urllib.error.HTTPError:
-        print(f'\033[91mResource unavailable: {RESOURCE_URL}\033[0m')
-        raise
-    with tarfile.open(download_file_path, 'r:bz2') as t:
-        t.extractall(path=target_dir)
+  os.makedirs(target_dir, exist_ok=True)
+  download_file_path = os.path.join(target_dir, 'knbc.tar.bz2')
+  try:
+    urllib.request.urlretrieve(RESOURCE_URL, download_file_path)
+  except urllib.error.HTTPError:
+    print(f'\033[91mResource unavailable: {RESOURCE_URL}\033[0m')
+    raise
+  with tarfile.open(download_file_path, 'r:bz2') as t:
+    t.extractall(path=target_dir)
 
 
 def main():
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument('-o',
-                        '--outfile',
-                        help='''File path to output the training data.
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument(
+      '-o',
+      '--outfile',
+      help='''File path to output the training data.
             (default: source.txt)''',
-                        default='source.txt')
-    args = parser.parse_args()
-    outfile = args.outfile
-    html_dir = 'data/KNBC_v1.0_090925_utf8/html/'
-    if not os.path.isdir(html_dir):
-        download_knbc('data')
-    with open(outfile, 'w') as f:
-        for file in sorted(os.listdir(html_dir)):
-            if file[-11:] != '-morph.html': continue
-            parser = KNBCHTMLParser(split_tab=False)
-            data = open(os.path.join(html_dir, file)).read()
-            parser.feed(data)
-            chunks = parser.chunks
-            chunks = postprocess(chunks)
-            if len(chunks) < 2: continue
-            f.write(utils.SEP.join(chunks) + '\n')
-    print('\033[92mTraining data is output to: %s\033[0m' % (outfile))
+      default='source.txt')
+  args = parser.parse_args()
+  outfile = args.outfile
+  html_dir = 'data/KNBC_v1.0_090925_utf8/html/'
+  if not os.path.isdir(html_dir):
+    download_knbc('data')
+  with open(outfile, 'w') as f:
+    for file in sorted(os.listdir(html_dir)):
+      if file[-11:] != '-morph.html':
+        continue
+      parser = KNBCHTMLParser(split_tab=False)
+      data = open(os.path.join(html_dir, file)).read()
+      parser.feed(data)
+      chunks = parser.chunks
+      chunks = postprocess(chunks)
+      if len(chunks) < 2:
+        continue
+      f.write(utils.SEP.join(chunks) + '\n')
+  print('\033[92mTraining data is output to: %s\033[0m' % (outfile))
 
 
 if __name__ == '__main__':
-    main()
+  main()