diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py deleted file mode 100644 index 2cfd89ce..00000000 --- a/budoux/feature_extractor.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Methods to encode source sentences to features.""" - -import typing - -from .utils import INVALID - - -def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, - w6: str) -> typing.List[str]: - """Generates a feature from characters around (w1-6). - - Args: - w1 (str): The character 3 characters before the break point. - w2 (str): The character 2 characters before the break point. - w3 (str): The character right before the break point. - w4 (str): The character right after the break point. - w5 (str): The character 2 characters after the break point. - w6 (str): The character 3 characters after the break point. - - Returns: - The feature (list[str]). - - """ - raw_feature = { - 'UW1': w1, - 'UW2': w2, - 'UW3': w3, - 'UW4': w4, - 'UW5': w5, - 'UW6': w6, - 'BW1': w2 + w3, - 'BW2': w3 + w4, - 'BW3': w4 + w5, - 'TW1': w1 + w2 + w3, - 'TW2': w2 + w3 + w4, - 'TW3': w3 + w4 + w5, - 'TW4': w4 + w5 + w6, - } - for key, value in list(raw_feature.items()): - if INVALID in value: - del raw_feature[key] - return [f'{item[0]}:{item[1]}' for item in raw_feature.items()] diff --git a/budoux/utils.py b/budoux/utils.py index fd73d6ce..278fddaa 100644 --- a/budoux/utils.py +++ b/budoux/utils.py @@ -15,6 +15,3 @@ SEP = '▁' """The separator string to specify breakpoints.""" - -INVALID = '▔' -"""The invalid feature string.""" diff --git a/scripts/encode_data.py b/scripts/encode_data.py index f984116e..025c84e9 100644 --- a/scripts/encode_data.py +++ b/scripts/encode_data.py @@ -20,11 +20,51 @@ import sys import typing -from budoux import feature_extractor, utils +from budoux import utils ArgList = typing.Optional[typing.List[str]] DEFAULT_OUTPUT_FILENAME = 'encoded_data.txt' +INVALID = '▔' +"""The invalid feature string.""" + + +def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, + w6: str) -> typing.List[str]: + """Generates a feature from characters around (w1-6). + + Args: + w1 (str): The character 3 characters before the break point. + w2 (str): The character 2 characters before the break point. + w3 (str): The character right before the break point. + w4 (str): The character right after the break point. + w5 (str): The character 2 characters after the break point. + w6 (str): The character 3 characters after the break point. + + Returns: + The feature (list[str]). + + """ + raw_feature = { + 'UW1': w1, + 'UW2': w2, + 'UW3': w3, + 'UW4': w4, + 'UW5': w5, + 'UW6': w6, + 'BW1': w2 + w3, + 'BW2': w3 + w4, + 'BW3': w4 + w5, + 'TW1': w1 + w2 + w3, + 'TW2': w2 + w3 + w4, + 'TW3': w3 + w4 + w5, + 'TW4': w4 + w5 + w6, + } + for key, value in list(raw_feature.items()): + if INVALID in value: + del raw_feature[key] + return [f'{item[0]}:{item[1]}' for item in raw_feature.items()] + def parse_args(test: ArgList = None) -> argparse.Namespace: """Parses commandline arguments. @@ -65,12 +105,11 @@ def process(i: int, sentence: str, sep_indices: typing.Set[int]) -> str: sentence (str): A sentence sep_indices (typing.Set[int]): A set of separator indices. """ - feature = feature_extractor.get_feature( - sentence[i - 3] if i > 2 else utils.INVALID, - sentence[i - 2] if i > 1 else utils.INVALID, sentence[i - 1], - sentence[i] if i < len(sentence) else utils.INVALID, - sentence[i + 1] if i + 1 < len(sentence) else utils.INVALID, - sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID) + feature = get_feature(sentence[i - 3] if i > 2 else INVALID, + sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], + sentence[i] if i < len(sentence) else INVALID, + sentence[i + 1] if i + 1 < len(sentence) else INVALID, + sentence[i + 2] if i + 2 < len(sentence) else INVALID) positive = i in sep_indices line = '\t'.join(['1' if positive else '-1'] + feature) return line diff --git a/tests/test_encode_data.py b/scripts/tests/test_encode_data.py similarity index 72% rename from tests/test_encode_data.py rename to scripts/tests/test_encode_data.py index 03186d86..25161ded 100644 --- a/tests/test_encode_data.py +++ b/scripts/tests/test_encode_data.py @@ -15,16 +15,64 @@ import os import sys +import typing import unittest from budoux import utils # module hack -LIB_PATH = os.path.join(os.path.dirname(__file__), '..') +LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..') sys.path.insert(0, os.path.abspath(LIB_PATH)) + from scripts import encode_data # type: ignore # noqa (module hack) +class TestGetFeature(unittest.TestCase): + + def test_standard(self) -> None: + feature = encode_data.get_feature('a', 'b', 'c', 'd', 'e', 'f') + self.assertSetEqual( + set(feature), + { + # Unigram of Words (UW) + 'UW1:a', + 'UW2:b', + 'UW3:c', + 'UW4:d', + 'UW5:e', + 'UW6:f', + + # Bigram of Words (BW) + 'BW1:bc', + 'BW2:cd', + 'BW3:de', + + # Trigram of Words (TW) + 'TW1:abc', + 'TW2:bcd', + 'TW3:cde', + 'TW4:def', + }, + 'Features should be extracted.') + + def test_with_invalid(self) -> None: + + def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool: + for item in feature: + if item.startswith(prefix): + return True + return False + + feature = encode_data.get_feature('a', 'a', encode_data.INVALID, 'a', 'a', + 'a') + self.assertFalse( + find_by_prefix('UW3:', feature), + 'Should omit the Unigram feature when the character is invalid.') + self.assertFalse( + find_by_prefix('BW2:', feature), + 'Should omit the Bigram feature that covers an invalid character.') + + class TestArgParse(unittest.TestCase): def test_cmdargs_invalid_option(self) -> None: @@ -107,3 +155,7 @@ def test_doubled_seps(self) -> None: sentence, sep_indices = encode_data.normalize_input(source) self.assertEqual(sentence, 'ABCDEFG') self.assertEqual(sep_indices, {3, 5, 7}) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py deleted file mode 100644 index f537c006..00000000 --- a/tests/test_feature_extractor.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests methods for the feature extractor.""" - -import typing -import unittest - -from budoux import feature_extractor, utils - - -class TestFeatureExtractor(unittest.TestCase): - - def test_get_feature(self) -> None: - feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f') - self.assertSetEqual( - set(feature), - { - # Unigram of Words (UW) - 'UW1:a', - 'UW2:b', - 'UW3:c', - 'UW4:d', - 'UW5:e', - 'UW6:f', - - # Bigram of Words (BW) - 'BW1:bc', - 'BW2:cd', - 'BW3:de', - - # Trigram of Words (TW) - 'TW1:abc', - 'TW2:bcd', - 'TW3:cde', - 'TW4:def', - }, - 'Features should be extracted.') - - def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool: - for item in feature: - if item.startswith(prefix): - return True - return False - - feature = feature_extractor.get_feature('a', 'a', utils.INVALID, 'a', 'a', - 'a') - self.assertFalse( - find_by_prefix('UW3:', feature), - 'Should omit the Unigram feature when the character is invalid.') - self.assertFalse( - find_by_prefix('BW2:', feature), - 'Should omit the Bigram feature that covers an invalid character.') - - -if __name__ == '__main__': - unittest.main()