Skip to content

Commit

Permalink
Remove the P features (#80)
Browse files Browse the repository at this point in the history
* Remove the past result features

* Update the Japanese model

* Update the Simplified Chinese model
  • Loading branch information
tushuhei authored Aug 3, 2022
1 parent b767d79 commit 89c56db
Show file tree
Hide file tree
Showing 7 changed files with 12 additions and 78 deletions.
25 changes: 3 additions & 22 deletions budoux/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ def unicode_block_index(w: str) -> str:
return '%03d' % (bisect.bisect_right(block_starts, ord(w[0])))


def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
p2: str, p3: str) -> typing.List[str]:
"""Generates a feature from characters around (w1-6) and past results (p1-3).
def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str,
w6: str) -> typing.List[str]:
"""Generates a feature from characters around (w1-6).
Args:
w1 (str): The character 3 characters before the break point.
Expand All @@ -49,9 +49,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
w4 (str): The character right after the break point.
w5 (str): The character 2 characters after the break point.
w6 (str): The character 3 characters after the break point.
p1 (str): The result 3 steps ago.
p2 (str): The result 2 steps ago.
p3 (str): The last result.
Returns:
The feature (list[str]).
Expand All @@ -64,11 +61,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
b5 = unicode_block_index(w5)
b6 = unicode_block_index(w6)
raw_feature = {
'UP1': p1,
'UP2': p2,
'UP3': p3,
'BP1': p1 + p2,
'BP2': p2 + p3,
'UW1': w1,
'UW2': w2,
'UW3': w3,
Expand All @@ -95,17 +87,6 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
'TB2': b2 + b3 + b4,
'TB3': b3 + b4 + b5,
'TB4': b4 + b5 + b6,
'UQ1': p1 + b1,
'UQ2': p2 + b2,
'UQ3': p3 + b3,
'BQ1': p2 + b2 + b3,
'BQ2': p2 + b3 + b4,
'BQ3': p3 + b2 + b3,
'BQ4': p3 + b3 + b4,
'TQ1': p2 + b1 + b2 + b3,
'TQ2': p2 + b2 + b3 + b4,
'TQ3': p3 + b1 + b2 + b3,
'TQ4': p3 + b2 + b3 + b4,
}
for key, value in list(raw_feature.items()):
if INVALID in value:
Expand Down
3 changes: 1 addition & 2 deletions budoux/models/ja-knbc.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion budoux/models/zh-hans.json

Large diffs are not rendered by default.

11 changes: 2 additions & 9 deletions budoux/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from html.parser import HTMLParser

from .feature_extractor import get_feature
from .utils import INVALID, SEP, Result
from .utils import INVALID, SEP

MODEL_DIR = os.path.join(os.path.dirname(__file__), 'models')
PARENT_CSS_STYLE = 'word-break: keep-all; overflow-wrap: break-word;'
Expand Down Expand Up @@ -121,27 +121,20 @@ def parse(self, sentence: str) -> typing.List[str]:
"""
if sentence == '':
return []
p1 = Result.UNKNOWN.value
p2 = Result.UNKNOWN.value
p3 = Result.UNKNOWN.value
chunks = [sentence[0]]
base_score = -sum(self.model.values())
for i in range(1, len(sentence)):
feature = get_feature(
sentence[i - 3] if i > 2 else INVALID,
sentence[i - 2] if i > 1 else INVALID, sentence[i - 1], sentence[i],
sentence[i + 1] if i + 1 < len(sentence) else INVALID,
sentence[i + 2] if i + 2 < len(sentence) else INVALID, p1, p2, p3)
sentence[i + 2] if i + 2 < len(sentence) else INVALID)
score = base_score + 2 * sum(
self.model[f] for f in feature if f in self.model)
if score > 0:
chunks.append(sentence[i])
else:
chunks[-1] += sentence[i]
p = Result.POSITIVE.value if score > 0 else Result.NEGATIVE.value
p1 = p2
p2 = p3
p3 = p
return chunks

def translate_html_string(self, html: str) -> str:
Expand Down
9 changes: 0 additions & 9 deletions budoux/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,8 @@
# limitations under the License.
"""Utilities for BudouX."""

from enum import Enum

SEP = '▁'
"""The separator string to specify breakpoints."""

INVALID = '▔'
"""The invalid feature string."""


class Result(Enum):
"""An enum to represent the type of inference result."""
UNKNOWN = 'U'
POSITIVE = 'B'
NEGATIVE = 'O'
9 changes: 1 addition & 8 deletions scripts/encode_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,16 @@ def process(line: str, entries_filename: str) -> None:
chunk_lengths = [len(chunk) for chunk in chunks]
sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
sentence = ''.join(chunks)
p1 = utils.Result.UNKNOWN.value
p2 = utils.Result.UNKNOWN.value
p3 = utils.Result.UNKNOWN.value
lines = []
for i in range(1, len(sentence) + 1):
feature = feature_extractor.get_feature(
sentence[i - 3] if i > 2 else utils.INVALID,
sentence[i - 2] if i > 1 else utils.INVALID, sentence[i - 1],
sentence[i] if i < len(sentence) else utils.INVALID,
sentence[i + 1] if i + 1 < len(sentence) else utils.INVALID,
sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID, p1, p2, p3)
sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID)
positive = i in sep_indices
p = utils.Result.POSITIVE.value if positive else utils.Result.NEGATIVE.value
lines.append('\t'.join(['1' if positive else '-1'] + feature) + '\n')
p1 = p2
p2 = p3
p3 = p
with open(entries_filename, 'a', encoding=sys.getdefaultencoding()) as f:
f.write(''.join(lines))

Expand Down
31 changes: 4 additions & 27 deletions tests/test_feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ def check(character: str, block: str, msg: str) -> None:
'Should return INVALID when INVALID is given.')

def test_get_feature(self) -> None:
feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f', 'x',
'y', 'z')
feature = feature_extractor.get_feature('a', 'b', 'c', 'd', 'e', 'f')
self.assertSetEqual(
set(feature),
{
Expand All @@ -50,11 +49,6 @@ def test_get_feature(self) -> None:
'UW5:e',
'UW6:f',

# Unigram of Previous Results (UP)
'UP1:x',
'UP2:y',
'UP3:z',

# Unigram of Unicode Blocks (UB)
'UB1:001',
'UB2:001',
Expand All @@ -63,28 +57,15 @@ def test_get_feature(self) -> None:
'UB5:001',
'UB6:001',

# Combination of UW and UP
'UQ1:x001',
'UQ2:y001',
'UQ3:z001',

# Bigram of Words (BW), Previous Results (BP), Unicode Blocks (BB), and
# its combination (BQ)
# Bigram of Words (BW) and Unicode Blocks (BB)
'BW1:bc',
'BW2:cd',
'BW3:de',
'BP1:xy',
'BP2:yz',
'BB1:001001',
'BB2:001001',
'BB3:001001',
'BQ1:y001001',
'BQ2:y001001',
'BQ3:z001001',
'BQ4:z001001',

# Trigram of Words (BW), Previous Results (BP), Unicode Blocks (BB), and
# its combination (BQ)
# Trigram of Words (TW) and Unicode Blocks (TB)
'TW1:abc',
'TW2:bcd',
'TW3:cde',
Expand All @@ -93,10 +74,6 @@ def test_get_feature(self) -> None:
'TB2:001001001',
'TB3:001001001',
'TB4:001001001',
'TQ1:y001001001',
'TQ2:y001001001',
'TQ3:z001001001',
'TQ4:z001001001',
},
'Features should be extracted.')

Expand All @@ -107,7 +84,7 @@ def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
return False

feature = feature_extractor.get_feature('a', 'a', utils.INVALID, 'a', 'a',
'a', 'a', 'a', 'a')
'a')
self.assertFalse(
find_by_prefix('UW3:', feature),
'Should omit the Unigram feature when the character is invalid.')
Expand Down

0 comments on commit 89c56db

Please sign in to comment.