Merge japanese-to-english multilingual branch #1860

Merged: 36 commits, Feb 3, 2025

Commits (36):
5062f12  add streaming support to reazonresearch (Aug 1, 2024)
5a0c247  update README for streaming (Aug 1, 2024)
62eb090  update streaming decoding file (Aug 1, 2024)
6317405  update streaming decode (Aug 1, 2024)
2d2daf6  remove streaming/greedy_search results folder (Aug 1, 2024)
e052481  update for streaming (Aug 1, 2024)
529d92f  Add docker images for torch 2.4 (#1704) (csukuangfj, Jul 25, 2024)
8189d11  remove prints (Aug 1, 2024)
916e84d  resolve PR issue (Aug 1, 2024)
707a956  Add multi_ja_en (Sep 14, 2024)
2e355a8  Update README.md (baileyeet, Sep 14, 2024)
b6af607  Update README.md (baileyeet, Dec 24, 2024)
7aedda0  Update RESULTS.md (baileyeet, Dec 24, 2024)
7b1445b  Update RESULTS.md (baileyeet, Dec 25, 2024)
4a55a10  Update RESULTS.md (baileyeet, Dec 25, 2024)
1bc7f07  Delete egs/multi_ja_en/ASR/zipformer/streaming/greedy_search directory (baileyeet, Dec 25, 2024)
68e1c3c  formatting (baileyeet, Nov 25, 2024)
a2bb272  formatting (baileyeet, Nov 25, 2024)
4604be8  add onnx decode (baileyeet, Dec 25, 2024)
f421001  remove unnecessary folders (baileyeet, Dec 25, 2024)
564b632  fix repeated definition of tokenize_by_ja_char (baileyeet, Jan 7, 2025)
5c142d4  Merge branch 'master' into einichi (baileyeet, Jan 7, 2025)
8a3790c  clean up files (baileyeet, Jan 8, 2025)
9d6211e  remove test (baileyeet, Jan 8, 2025)
84c91db  edit prepare.sh (baileyeet, Jan 14, 2025)
1244de9  update python ver (baileyeet, Jan 14, 2025)
aa74f6c  update python ver (baileyeet, Jan 14, 2025)
b574e68  udpate symlink (baileyeet, Jan 14, 2025)
9ab3021  Reformatted streaming_decode.py with flake8 (baileyeet, Jan 14, 2025)
3eec244  Update RESULTS.md (baileyeet, Jan 20, 2025)
efc0536  Merge branch 'k2-fsa:master' into einichi (baileyeet, Jan 27, 2025)
50c3270  Update generate_build_matrix.py (JinZr, Jan 28, 2025)
b8ce806  Update build-docker-image.yml (JinZr, Jan 28, 2025)
5cf7e42  Update zipformer.py (JinZr, Jan 28, 2025)
1cb4594  Update zipformer.py (JinZr, Jan 28, 2025)
b9efbf8  Update utils.py (JinZr, Jan 28, 2025)
Changes from 1 commit:

fix repeated definition of tokenize_by_ja_char
baileyeet committed Jan 7, 2025
commit 564b632eda65ea678721b0de8ce42364befe0424
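This commit removes five identical per-script copies of tokenize_by_ja_char from the multi_ja_en recipe and replaces them with a single shared definition in icefall/utils.py, which is also re-exported from the icefall package. After the change, each script needs only the one-line import below (a minimal sketch, assuming the icefall repo root is importable, e.g. on PYTHONPATH):

```python
# Shared helper replaces the per-script copies removed in the diffs below;
# assumes the icefall package is importable.
from icefall.utils import tokenize_by_ja_char
```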
24 changes: 1 addition & 23 deletions egs/multi_ja_en/ASR/local/prepare_for_bpe_model.py

@@ -25,29 +25,7 @@

 from tqdm.auto import tqdm

-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-    Args:
-      line:
-        The input text.
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char


 def get_args():
26 changes: 1 addition & 25 deletions egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py

@@ -50,31 +50,7 @@
 )

 from icefall.byte_utils import byte_encode
-from icefall.utils import str2bool
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-    Args:
-      line:
-        The input text.
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import str2bool, tokenize_by_ja_char


 def lexicon_to_fst_no_sil(
25 changes: 1 addition & 24 deletions egs/multi_ja_en/ASR/local/train_bbpe_model.py

@@ -33,30 +33,7 @@
 import sentencepiece as spm

 from icefall import byte_encode
-
-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-    Args:
-      line:
-        The input text.
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
+from icefall.utils import tokenize_by_ja_char


 def get_args():
25 changes: 1 addition & 24 deletions egs/multi_ja_en/ASR/zipformer/decode.py

@@ -96,36 +96,13 @@
     setup_logger,
     store_transcripts,
     str2bool,
+    tokenize_by_ja_char,
     write_error_stats,
 )

 LOG_EPS = math.log(1e-10)

-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-    Args:
-      line:
-        The input text.
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
-
-
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
25 changes: 1 addition & 24 deletions egs/multi_ja_en/ASR/zipformer/train.py

@@ -101,35 +101,12 @@
     get_parameter_groups_with_lrs,
     setup_logger,
     str2bool,
+    tokenize_by_ja_char,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

-
-def tokenize_by_ja_char(line: str) -> str:
-    """
-    Tokenize a line of text with Japanese characters.
-    Note: All non-Japanese characters will be upper case.
-    Example:
-      input = "こんにちは世界は hello world の日本語"
-      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
-    Args:
-      line:
-        The input text.
-    Return:
-      A new string tokenized by Japanese characters.
-    """
-    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
-    chars = pattern.split(line.strip())
-    return " ".join(
-        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
-    )
-
-
 def get_adjusted_batch_count(params: AttributeDict) -> float:
     # returns the number of batches we would have used so far if we had used the reference
     # duration. This is for purposes of set_batch_count().
1 change: 1 addition & 0 deletions icefall/__init__.py

@@ -68,6 +68,7 @@
     str2bool,
     subsequent_chunk_mask,
     tokenize_by_CJK_char,
+    tokenize_by_ja_char,
     write_error_stats,
 )

24 changes: 24 additions & 0 deletions icefall/utils.py

@@ -1746,6 +1746,30 @@ def tokenize_by_CJK_char(line: str) -> str:
     return " ".join([w.strip() for w in chars if w.strip()])


+def tokenize_by_ja_char(line: str) -> str:
+    """
+    Tokenize a line of text with Japanese characters.
+    Note: All non-Japanese characters will be upper case.
+    Example:
+      input = "こんにちは世界は hello world の日本語"
+      output = "こ ん に ち は 世 界 は HELLO WORLD の 日 本 語"
+    Args:
+      line:
+        The input text.
+    Return:
+      A new string tokenized by Japanese characters.
+    """
+    pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")
+    chars = pattern.split(line.strip())
+    return " ".join(
+        [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
+    )
+
+
 def display_and_save_batch(
     batch: dict,
     params: AttributeDict,
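For reference, a standalone sketch of what the newly added helper does, using the example from its own docstring. The regex character classes cover hiragana (U+3040 to U+309F), katakana (U+30A0 to U+30FF), and CJK unified ideographs, i.e. kanji (U+4E00 to U+9FFF):

```python
import re

# Same pattern as tokenize_by_ja_char above: each hiragana, katakana, or
# kanji character becomes its own token; non-Japanese runs are kept as
# whole fragments, stripped, and upper-cased.
pattern = re.compile(r"([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF])")

line = "こんにちは世界は hello world の日本語"
chars = pattern.split(line.strip())  # capturing group keeps the separators
tokens = [w.strip().upper() if not pattern.match(w) else w for w in chars if w.strip()]
print(" ".join(tokens))
# こ ん に ち は 世 界 は HELLO WORLD の 日 本 語
```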