From 589f93358d700c91ac4852ea58ca2e29d3d4c36c Mon Sep 17 00:00:00 2001 From: Wei Kang Date: Sun, 14 Jan 2024 22:28:29 +0800 Subject: [PATCH] If break at period, it has to be followed by space (#61) --- examples/libriheavy/matching.py | 2 +- textsearch/python/textsearch/match.py | 72 ++++++++++++++++----------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/examples/libriheavy/matching.py b/examples/libriheavy/matching.py index 8767dfa..0830871 100755 --- a/examples/libriheavy/matching.py +++ b/examples/libriheavy/matching.py @@ -95,7 +95,7 @@ def get_params() -> AttributeDict: "preceding_context_length": 1000, "timestamp_position": "current", "silence_length_to_break": 0.45, - "overlap_ratio": 0.45, + "overlap_ratio": 0.5, "min_duration": 2, "max_duration": 30, "expected_duration": (5, 20), diff --git a/textsearch/python/textsearch/match.py b/textsearch/python/textsearch/match.py index 7917f1b..dfa7f58 100644 --- a/textsearch/python/textsearch/match.py +++ b/textsearch/python/textsearch/match.py @@ -704,55 +704,71 @@ def _get_segment_candidates( # punctuation prev_punctuation = 0 j = align["ref_pos"] - 1 + num_space_behind_period = 0 while j >= 0: current_token = chr(target_source.binary_text[j]) if is_punctuation(current_token, eos_only=True): - tmp = "".join( - [ - chr(x) - for x in target_source.binary_text[ - j - period_pattern_length : j + 1 + if current_token == ".": + tmp = "".join( + [ + chr(x) + for x in target_source.binary_text[ + j - period_pattern_length : j + 1 + ] ] - ] - ) - if current_token != "." or ( - current_token == "." - and period_patterns.search(tmp) is not None - ): + ) + if ( + period_patterns.search(tmp) is not None + and num_space_behind_period >= 1 + ): + prev_punctuation = punctuation_score + break + else: + break + else: prev_punctuation = punctuation_score break - else: - j -= 1 elif current_token == " " or is_punctuation(current_token): + if current_token == " ": + num_space_behind_period += 1 j -= 1 else: break succ_punctuation = 0 j = align["ref_pos"] + 1 + followed_by_period = False + followed_by_other_eos = False + num_space_behind_period = 0 while j < target_source.binary_text.size: current_token = chr(target_source.binary_text[j]) if is_punctuation(current_token, eos_only=True): - tmp = "".join( - [ - chr(x) - for x in target_source.binary_text[ - j - period_pattern_length : j + 1 + if current_token == ".": + tmp = "".join( + [ + chr(x) + for x in target_source.binary_text[ + j - period_pattern_length : j + 1 + ] ] - ] - ) - if current_token != "." or ( - current_token == "." - and period_patterns.search(tmp) is not None - ): - succ_punctuation = punctuation_score - break + ) + if period_patterns.search(tmp) is not None: + followed_by_period = True else: - j += 1 + followed_by_other_eos = True + j += 1 elif current_token == " " or is_punctuation(current_token): + if current_token == " ": + num_space_behind_period += 1 j += 1 else: - break + if ( + followed_by_period and num_space_behind_period >= 1 + ) or followed_by_other_eos: + succ_punctuation = punctuation_score + break + else: + break begin_score = ( prev_silence