From 3d73aa2a0bc9fd82f32d488d725169b69e5d7c4e Mon Sep 17 00:00:00 2001 From: Shuhei Iitsuka Date: Wed, 19 Apr 2023 06:21:01 +0000 Subject: [PATCH 1/3] Add test for KNBCHTMLParser Change-Id: I7cab02881e8dc2099e5d48c6a3c513e70fe5bb85 --- scripts/prepare_knbc.py | 56 ++++++++++++++++++++++-------- scripts/tests/test_prepare_knbc.py | 32 +++++++++++++++++ 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/scripts/prepare_knbc.py b/scripts/prepare_knbc.py index c9ae5419..fcb898ae 100644 --- a/scripts/prepare_knbc.py +++ b/scripts/prepare_knbc.py @@ -33,36 +33,62 @@ class KNBCHTMLParser(HTMLParser): - """Parses the HTML files in the KNBC corpus and outputs the chunks.""" + """Parses the HTML files in the KNBC corpus to collect chunks. + + Attributes: + chunks: The collected chunks. + row: The current row index. + col: The current column index. + current_word: The current word to process. + on_split_row: Whether the scan is on the splitting row. + split_tab: Whether to split by tags in addition to Bunsetsu. + """ + + BUNSETSU_SPLIT_ID = 'bnst-kugiri' + TAG_SPLIT_ID = 'tag-kugiri' + + def __init__(self, split_tab: bool = False) -> None: + """Initializes the HTML parser for the KNBC corpus. - def __init__(self, split_tab: bool = True) -> None: + Args: + split_tab: Split by tags in addition to Bunsetsu. (default: False) + """ super().__init__() self.chunks = [''] - self.n_rows = 0 - self.n_cols = 0 + self.row = 0 + self.col = 0 self.current_word: typing.Optional[str] = None + self.on_split_row = False self.split_tab = split_tab - def handle_starttag(self, tag: str, _: typing.Any) -> None: + def handle_starttag( + self, tag: str, + attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None: if tag == 'tr': - self.n_rows += 1 - self.n_cols = 0 + self.row += 1 + self.col = 0 self.current_word = None + self.on_split_row = False + if tag == 'td': - self.n_cols += 1 + self.col += 1 + for name, value in attributes: + if (name == 'id' and value == self.BUNSETSU_SPLIT_ID) or ( + self.split_tab and name == 'id' and value == self.TAG_SPLIT_ID): + self.on_split_row = True def handle_endtag(self, tag: str) -> None: - if tag != 'tr': + if tag != 'tr': # Skip all tags but TR. + return None + if self.row < 3: # Skip the first two rows. return None - flag1 = self.n_rows > 2 and self.n_cols == 1 - flag2 = self.split_tab or self.current_word == '文節区切り' - if flag1 and flag2: - self.chunks.append('') - if self.n_cols == 5 and type(self.current_word) is str: + if self.on_split_row: + return self.chunks.append('') + if self.col == 5 and self.current_word: self.chunks[-1] += self.current_word def handle_data(self, data: str) -> None: - if self.n_cols == 1: + if self.col == 1: self.current_word = data diff --git a/scripts/tests/test_prepare_knbc.py b/scripts/tests/test_prepare_knbc.py index cdfbc38c..048a5261 100644 --- a/scripts/tests/test_prepare_knbc.py +++ b/scripts/tests/test_prepare_knbc.py @@ -40,3 +40,35 @@ def test_multiple_hit(self) -> None: chunks = ['abcabc', 'def'] result = prepare_knbc.break_before_sequence(chunks, 'bc') self.assertListEqual(result, ['a', 'bca', 'bc', 'def']) + + +class TestKNBCHTMLParser(unittest.TestCase): + + # extracted from KN001_Keitai_1-1-1-01-morph.html + example_html = ''' + + + + + + + + + + + + +
HAHBHCHDHE
文節区切り
abc
de
タグ区切り
fgh
ijkl
文節区切り
mn
+ + + ''' + + def test_parse(self) -> None: + parser = prepare_knbc.KNBCHTMLParser(False) + parser.feed(self.example_html) + self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn']) + + def test_parse_split_tags(self) -> None: + parser = prepare_knbc.KNBCHTMLParser(True) + parser.feed(self.example_html) + self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn']) From 3c0cf8f1ef77bbb4f8e244e826b02bae02acb900 Mon Sep 17 00:00:00 2001 From: Shuhei Iitsuka Date: Wed, 19 Apr 2023 06:25:56 +0000 Subject: [PATCH 2/3] nit Change-Id: I18c2e1d9ff6584e5770d0644d2795126d01b99a1 --- scripts/tests/test_prepare_knbc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/tests/test_prepare_knbc.py b/scripts/tests/test_prepare_knbc.py index 048a5261..1aa21519 100644 --- a/scripts/tests/test_prepare_knbc.py +++ b/scripts/tests/test_prepare_knbc.py @@ -43,8 +43,6 @@ def test_multiple_hit(self) -> None: class TestKNBCHTMLParser(unittest.TestCase): - - # extracted from KN001_Keitai_1-1-1-01-morph.html example_html = ''' From b198129d4ee7b7ddf3432f58ee8d9dd7614413ee Mon Sep 17 00:00:00 2001 From: Shuhei Iitsuka Date: Wed, 19 Apr 2023 06:28:16 +0000 Subject: [PATCH 3/3] Type current_word strictly Change-Id: I7cccaaa11f03f8c3241e92251a4c9d2d5445a2e3 --- scripts/prepare_knbc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/prepare_knbc.py b/scripts/prepare_knbc.py index fcb898ae..c6e79162 100644 --- a/scripts/prepare_knbc.py +++ b/scripts/prepare_knbc.py @@ -57,7 +57,7 @@ def __init__(self, split_tab: bool = False) -> None: self.chunks = [''] self.row = 0 self.col = 0 - self.current_word: typing.Optional[str] = None + self.current_word = '' self.on_split_row = False self.split_tab = split_tab @@ -67,7 +67,7 @@ def handle_starttag( if tag == 'tr': self.row += 1 self.col = 0 - self.current_word = None + self.current_word = '' self.on_split_row = False if tag == 'td': @@ -84,7 +84,7 @@ def handle_endtag(self, tag: str) -> None: return None if self.on_split_row: return self.chunks.append('') - if self.col == 5 and self.current_word: + if self.col == 5: self.chunks[-1] += self.current_word def handle_data(self, data: str) -> None: