From 50178f60352331d065254c456a2a619aadf3281e Mon Sep 17 00:00:00 2001
From: Shuhei Iitsuka <tushuhei@google.com>
Date: Wed, 19 Apr 2023 16:24:10 +0900
Subject: [PATCH] Improve KNBC HTML Parser (#137)

---
 scripts/prepare_knbc.py            | 60 +++++++++++++++++++++---------
 scripts/tests/test_prepare_knbc.py | 30 +++++++++++++++
 2 files changed, 73 insertions(+), 17 deletions(-)
diff --git a/scripts/prepare_knbc.py b/scripts/prepare_knbc.py
index c9ae5419..c6e79162 100644
--- a/scripts/prepare_knbc.py
+++ b/scripts/prepare_knbc.py
@@ -33,36 +33,62 @@
 
 
 class KNBCHTMLParser(HTMLParser):
-  """Parses the HTML files in the KNBC corpus and outputs the chunks."""
+  """Parses the HTML files in the KNBC corpus to collect chunks.
+
+  Attributes:
+    chunks: The collected chunks; a separator row starts a new one.
+    row: The number of TR start tags seen so far (1-based row index).
+    col: The number of TD start tags seen so far in the current row.
+    current_word: The latest text seen in the first cell of the current row.
+    on_split_row: Whether the current row is a chunk separator row.
+    split_tab: Whether to split by tags in addition to Bunsetsu.
+  """
+
+  BUNSETSU_SPLIT_ID = 'bnst-kugiri'
+  TAG_SPLIT_ID = 'tag-kugiri'
+
+  def __init__(self, split_tab: bool = False) -> None:
+    """Initializes the HTML parser for the KNBC corpus.
 
-  def __init__(self, split_tab: bool = True) -> None:
+    Args:
+      split_tab: Split by tags in addition to Bunsetsu. (default: False)
+    """
     super().__init__()
     self.chunks = ['']
-    self.n_rows = 0
-    self.n_cols = 0
-    self.current_word: typing.Optional[str] = None
+    self.row = 0
+    self.col = 0
+    self.current_word = ''
+    self.on_split_row = False
     self.split_tab = split_tab
 
-  def handle_starttag(self, tag: str, _: typing.Any) -> None:
+  def handle_starttag(
+      self, tag: str,
+      attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None:
     if tag == 'tr':
-      self.n_rows += 1
-      self.n_cols = 0
-      self.current_word = None
+      self.row += 1
+      self.col = 0
+      self.current_word = ''
+      self.on_split_row = False
+
     if tag == 'td':
-      self.n_cols += 1
+      self.col += 1
+      for name, value in attributes:
+        if (name == 'id' and value == self.BUNSETSU_SPLIT_ID) or (
+            self.split_tab and name == 'id' and value == self.TAG_SPLIT_ID):
+          self.on_split_row = True
 
   def handle_endtag(self, tag: str) -> None:
-    if tag != 'tr':
+    if tag != 'tr':  # Skip all tags but TR.
+      return None
+    if self.row < 3:  # Skip the first two rows.
       return None
-    flag1 = self.n_rows > 2 and self.n_cols == 1
-    flag2 = self.split_tab or self.current_word == '文節区切り'
-    if flag1 and flag2:
-      self.chunks.append('')
-    if self.n_cols == 5 and type(self.current_word) is str:
+    if self.on_split_row:
+      self.chunks.append('')  # A separator row starts a new chunk.
+    elif self.col == 5:  # Only five-cell rows contribute their word.
       self.chunks[-1] += self.current_word
 
   def handle_data(self, data: str) -> None:
-    if self.n_cols == 1:
+    if self.col == 1:  # The word is the text of the first cell.
       self.current_word = data
 
 
diff --git a/scripts/tests/test_prepare_knbc.py b/scripts/tests/test_prepare_knbc.py
index cdfbc38c..1aa21519 100644
--- a/scripts/tests/test_prepare_knbc.py
+++ b/scripts/tests/test_prepare_knbc.py
@@ -40,3 +40,33 @@ def test_multiple_hit(self) -> None:
     chunks = ['abcabc', 'def']
     result = prepare_knbc.break_before_sequence(chunks, 'bc')
     self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])
+
+
+class TestKNBCHTMLParser(unittest.TestCase):  # Exercises chunk collection.
+  example_html = '''
+  <html>
+    <body>
+      <table>
+        <tr><th>HA</th><th>HB</th><th>HC</th><th>HD</th><th>HE</th></tr>
+        <tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
+        <tr><td>abc</td><td></td><td></td><td></td><td></td></tr>
+        <tr><td>de</td><td></td><td></td><td></td><td></td></tr>
+        <tr><td colspan="5" id="tag-kugiri"><a>タグ区切り</a></td></tr>
+        <tr><td>fgh</td><td></td><td></td><td></td><td> </td></tr>
+        <tr><td>ijkl</td><td></td><td></td><td></td><td> </td></tr>
+        <tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
+        <tr><td>mn</td><td></td><td></td><td></td><td> </td></tr>
+      </table>
+    </body>
+  </html>
+  '''
+
+  def test_parse(self) -> None:  # Tag separator rows must not split.
+    knbc_parser = prepare_knbc.KNBCHTMLParser(split_tab=False)
+    knbc_parser.feed(self.example_html)
+    self.assertListEqual(knbc_parser.chunks, ['abcdefghijkl', 'mn'])
+
+  def test_parse_split_tags(self) -> None:  # Tag separator rows also split.
+    knbc_parser = prepare_knbc.KNBCHTMLParser(split_tab=True)
+    knbc_parser.feed(self.example_html)
+    self.assertListEqual(knbc_parser.chunks, ['abcde', 'fghijkl', 'mn'])