Skip to content

Commit

Permalink
Merge pull request #50 from mideind/fix/colon-time-correct-spaces
Browse files (browse the repository at this point in the history)
Fix/colon time correct spaces
  • Loading branch information
gardarjuto authored Sep 13, 2024
2 parents 34b699f + 62b7bfa commit 2a71169
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
15 changes: 14 additions & 1 deletion src/tokenizer/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3223,7 +3223,20 @@ def correct_spaces(s: str) -> str:
# "bensínstöðvar, -dælur og -tankar"
r[-1] = " -"
r.append(w)
elif TP_SPACE[last - 1][this - 1] and r:
elif (
TP_SPACE[last - 1][this - 1]
and r
and not (
# Special case for colon-separated time or duration
# such as "12:00", "3:15" or "37:02:29"
w.isnumeric()
and len(w) == 2
and len(r) >= 2
and r[-1] == ":"
and (p := r[-2].strip()).isnumeric()
and len(p) in {1, 2}
)
):
r.append(" " + w)
else:
r.append(w)
Expand Down
9 changes: 9 additions & 0 deletions test/test_tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1145,6 +1145,15 @@ def test_correct_spaces() -> None:
"Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"
)
assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?"
# Test that colon-separated times are not space-separated
s = t.correct_spaces("Klukkan er 12: 00 og ég ætla að fara út .")
assert s == "Klukkan er 12:00 og ég ætla að fara út."
s = t.correct_spaces("Tíminn byrjar kl . 4: 14 og klukkan er núna 5:00 .")
assert s == "Tíminn byrjar kl. 4:14 og klukkan er núna 5:00."
s = t.correct_spaces("Veislan verður kl . 12 : 00 - 14 : 00.")
assert s == "Veislan verður kl. 12:00-14:00."
s = t.correct_spaces("Hún kom í mark á tímanum 3 : 59 : 04 ,rétt fyrir lokin.")
assert s == "Hún kom í mark á tímanum 3:59:04, rétt fyrir lokin."


def test_abbrev() -> None:
Expand Down

0 comments on commit 2a71169

Please sign in to comment.