From 589f93358d700c91ac4852ea58ca2e29d3d4c36c Mon Sep 17 00:00:00 2001
From: Wei Kang <wkang.pku@gmail.com>
Date: Sun, 14 Jan 2024 22:28:29 +0800
Subject: [PATCH] If break at period, it has to be followed by space (#61)

---
 examples/libriheavy/matching.py       |  2 +-
 textsearch/python/textsearch/match.py | 72 ++++++++++++++++-----------
 2 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/examples/libriheavy/matching.py b/examples/libriheavy/matching.py
index 8767dfa..0830871 100755
--- a/examples/libriheavy/matching.py
+++ b/examples/libriheavy/matching.py
@@ -95,7 +95,7 @@ def get_params() -> AttributeDict:
             "preceding_context_length": 1000,
             "timestamp_position": "current",
             "silence_length_to_break": 0.45,
-            "overlap_ratio": 0.45,
+            "overlap_ratio": 0.5,
             "min_duration": 2,
             "max_duration": 30,
             "expected_duration": (5, 20),
diff --git a/textsearch/python/textsearch/match.py b/textsearch/python/textsearch/match.py
index 7917f1b..dfa7f58 100644
--- a/textsearch/python/textsearch/match.py
+++ b/textsearch/python/textsearch/match.py
@@ -704,55 +704,71 @@ def _get_segment_candidates(
         # punctuation
         prev_punctuation = 0
         j = align["ref_pos"] - 1
+        num_space_behind_period = 0
         while j >= 0:
             current_token = chr(target_source.binary_text[j])
             if is_punctuation(current_token, eos_only=True):
-                tmp = "".join(
-                    [
-                        chr(x)
-                        for x in target_source.binary_text[
-                            j - period_pattern_length : j + 1
+                if current_token == ".":
+                    tmp = "".join(
+                        [
+                            chr(x)
+                            for x in target_source.binary_text[
+                                j - period_pattern_length : j + 1
+                            ]
                         ]
-                    ]
-                )
-                if current_token != "." or (
-                    current_token == "."
-                    and period_patterns.search(tmp) is not None
-                ):
+                    )
+                    if (
+                        period_patterns.search(tmp) is not None
+                        and num_space_behind_period >= 1
+                    ):
+                        prev_punctuation = punctuation_score
+                        break
+                    else:
+                        break
+                else:
                     prev_punctuation = punctuation_score
                     break
-                else:
-                    j -= 1
             elif current_token == " " or is_punctuation(current_token):
+                if current_token == " ":
+                    num_space_behind_period += 1
                 j -= 1
             else:
                 break
 
         succ_punctuation = 0
         j = align["ref_pos"] + 1
+        followed_by_period = False
+        followed_by_other_eos = False
+        num_space_behind_period = 0
         while j < target_source.binary_text.size:
             current_token = chr(target_source.binary_text[j])
             if is_punctuation(current_token, eos_only=True):
-                tmp = "".join(
-                    [
-                        chr(x)
-                        for x in target_source.binary_text[
-                            j - period_pattern_length : j + 1
+                if current_token == ".":
+                    tmp = "".join(
+                        [
+                            chr(x)
+                            for x in target_source.binary_text[
+                                j - period_pattern_length : j + 1
+                            ]
                         ]
-                    ]
-                )
-                if current_token != "." or (
-                    current_token == "."
-                    and period_patterns.search(tmp) is not None
-                ):
-                    succ_punctuation = punctuation_score
-                    break
+                    )
+                    if period_patterns.search(tmp) is not None:
+                        followed_by_period = True
                 else:
-                    j += 1
+                    followed_by_other_eos = True
+                j += 1
             elif current_token == " " or is_punctuation(current_token):
+                if current_token == " ":
+                    num_space_behind_period += 1
                 j += 1
             else:
-                break
+                if (
+                    followed_by_period and num_space_behind_period >= 1
+                ) or followed_by_other_eos:
+                    succ_punctuation = punctuation_score
+                    break
+                else:
+                    break
 
         begin_score = (
             prev_silence