Skip to content

Commit

Permalink
Improve KNBC HTML Parser (#137)
Browse files (browse the repository at this point in the history)
  • Loading branch information
tushuhei authored Apr 19, 2023
1 parent 181e85e commit 50178f6
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 17 deletions.
60 changes: 43 additions & 17 deletions scripts/prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,36 +33,62 @@


class KNBCHTMLParser(HTMLParser):
    """Parses the HTML files in the KNBC corpus to collect chunks.

    The parser walks the ``<tr>`` rows of the fed HTML. The first two rows
    are skipped. From the third row on, a row whose first ``<td>`` carries a
    split id starts a new chunk, and a row with exactly five ``<td>`` cells
    appends the text of its first cell to the current chunk.

    Attributes:
        chunks: The collected chunks.
        row: The current row index.
        col: The current column index.
        current_word: The current word to process.
        on_split_row: Whether the scan is on the splitting row.
        split_tab: Whether to split by tags in addition to Bunsetsu.
    """

    BUNSETSU_SPLIT_ID = 'bnst-kugiri'
    TAG_SPLIT_ID = 'tag-kugiri'

    def __init__(self, split_tab: bool = False) -> None:
        """Initializes the HTML parser for the KNBC corpus.

        Args:
            split_tab: Split by tags in addition to Bunsetsu.
                (default: False)
        """
        super().__init__()
        self.chunks: typing.List[str] = ['']
        self.row = 0
        self.col = 0
        self.current_word = ''
        self.on_split_row = False
        self.split_tab = split_tab
        # True only between the opening and closing tag of a row's first
        # cell, so that text outside that cell cannot replace the word.
        self._in_first_cell = False

    def handle_starttag(
        self,
        tag: str,
        attributes: typing.List[typing.Tuple[str, typing.Optional[str]]],
    ) -> None:
        """Updates the row/column position and detects splitting rows.

        Args:
            tag: The name of the opened tag.
            attributes: The (name, value) pairs of the tag's attributes.
        """
        if tag == 'tr':
            # A new row resets all per-row state.
            self.row += 1
            self.col = 0
            self.current_word = ''
            self.on_split_row = False
            self._in_first_cell = False
        elif tag == 'td':
            self.col += 1
            # Opening any later cell implicitly ends the first one.
            self._in_first_cell = self.col == 1
            split_ids = {self.BUNSETSU_SPLIT_ID}
            if self.split_tab:
                split_ids.add(self.TAG_SPLIT_ID)
            if any(name == 'id' and value in split_ids
                   for name, value in attributes):
                self.on_split_row = True

    def handle_endtag(self, tag: str) -> None:
        """Commits the finished row to ``chunks`` when a row closes.

        Args:
            tag: The name of the closed tag.
        """
        if tag == 'td':
            self._in_first_cell = False
            return
        if tag != 'tr':  # Skip all tags but TR.
            return
        if self.row < 3:  # Skip the first two rows.
            return
        if self.on_split_row:
            self.chunks.append('')
        elif self.col == 5:
            self.chunks[-1] += self.current_word

    def handle_data(self, data: str) -> None:
        """Records the text of the first cell of the current row.

        Args:
            data: The text found between tags.
        """
        # Only text inside the first cell counts; whitespace between the
        # first and second cells must not overwrite the captured word.
        if self._in_first_cell:
            self.current_word = data


Expand Down
30 changes: 30 additions & 0 deletions scripts/tests/test_prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,33 @@ def test_multiple_hit(self) -> None:
chunks = ['abcabc', 'def']
result = prepare_knbc.break_before_sequence(chunks, 'bc')
self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])


class TestKNBCHTMLParser(unittest.TestCase):
    """Checks the chunks that KNBCHTMLParser collects from a sample table."""

    # Sample table: one header row, then data rows separated by Bunsetsu
    # ("bnst-kugiri") and tag ("tag-kugiri") splitting rows.
    example_html = '''
<html>
<body>
<table>
<tr><th>HA</th><th>HB</th><th>HC</th><th>HD</th><th>HE</th></tr>
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
<tr><td>abc</td><td></td><td></td><td></td><td></td></tr>
<tr><td>de</td><td></td><td></td><td></td><td></td></tr>
<tr><td colspan="5" id="tag-kugiri"><a>タグ区切り</a></td></tr>
<tr><td>fgh</td><td></td><td></td><td></td><td> </td></tr>
<tr><td>ijkl</td><td></td><td></td><td></td><td> </td></tr>
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
<tr><td>mn</td><td></td><td></td><td></td><td> </td></tr>
</table>
</body>
</html>
'''

    def _collect_chunks(self, split_tab: bool) -> list:
        """Feeds the sample HTML to a fresh parser and returns its chunks."""
        html_parser = prepare_knbc.KNBCHTMLParser(split_tab)
        html_parser.feed(self.example_html)
        return html_parser.chunks

    def test_parse(self) -> None:
        """Only Bunsetsu splitting rows start a new chunk."""
        expected = ['abcdefghijkl', 'mn']
        self.assertListEqual(self._collect_chunks(False), expected)

    def test_parse_split_tags(self) -> None:
        """Tag splitting rows start a new chunk as well."""
        expected = ['abcde', 'fghijkl', 'mn']
        self.assertListEqual(self._collect_chunks(True), expected)

0 comments on commit 50178f6

Please sign in to comment.