diff --git a/examples/speech_matrix/README.md b/examples/speech_matrix/README.md index ace7db0c5c..5e2c941062 100644 --- a/examples/speech_matrix/README.md +++ b/examples/speech_matrix/README.md @@ -48,6 +48,15 @@ Audios are saved to ${SAVE_ROOT}/audios/. For example, English audios are compre Speech alignments are saved to ${SAVE_ROOT}/aligned_speech/. For example, en-fr.tsv.gz contains a pair of aligned audio paths in English and French respectively together with their alignment score in each line. +## Speech Transcriptions + +While SpeechMatrix focuses on speech-only data mining and translation, we provide transcriptions for the mined speech in case they are needed for future research. The transcriptions are generated with [Whisper](https://github.com/openai/whisper): we use medium.en for English transcription and medium for other languages. Currently, transcriptions are provided for the target speech in these language directions: {"cs", "de", "en", "es", "et", "fi", "fr", "hu", "it", "lt", "nl", "pl", "pt", "ro", "sl"}-{"de", "en", "es", "fr", "nl"}.
+
+```bash
+# SAVE_ROOT: the directory to save mined data
+python mined_train_sets/download_transcriptions.py \
+    --save-root ${SAVE_ROOT}
+```
 ## Speech-to-Unit Data
diff --git a/examples/speech_matrix/data_helper/data_cfg.py b/examples/speech_matrix/data_helper/data_cfg.py
index d0a5bfc6b1..a3bdacb583 100644
--- a/examples/speech_matrix/data_helper/data_cfg.py
+++ b/examples/speech_matrix/data_helper/data_cfg.py
@@ -139,3 +139,11 @@ hubert_key = "hubert"
 vocoder_key = "vocoder"
 s2s_key = "s2s_models"
+trans_key = "transcriptions"
+# source / target languages of the directions that have transcriptions
+TRANS_SRC_LANGS = [
+    "cs", "de", "en", "es", "et", "fi",
+    "fr", "hu", "it", "lt", "nl", "pl",
+    "pt", "ro", "sl",
+]
+TRANS_TGT_LANGS = ["de", "en", "es", "fr", "nl"]
diff --git a/examples/speech_matrix/mined_train_sets/download_transcriptions.py b/examples/speech_matrix/mined_train_sets/download_transcriptions.py
new file mode 100644
index 0000000000..33b4390c7e
--- /dev/null
+++ b/examples/speech_matrix/mined_train_sets/download_transcriptions.py
@@ -0,0 +1,53 @@
+import os
+import argparse
+import subprocess
+from examples.speech_matrix.data_helper.data_cfg import (
+    DOWNLOAD_HUB,
+    VP_LANGS,
+    trans_key,
+    TRANS_SRC_LANGS,
+    TRANS_TGT_LANGS,
+)
+
+
+def download_transcriptions(src_lang, tgt_lang, save_root):
+    """Download the ``tgt_lang`` transcriptions of one language direction.
+
+    The file is fetched with ``wget`` into ``<save_root>/<trans_key>/``,
+    which is created if it does not exist. A failed download is not
+    treated as an error: the exit status of ``wget`` is ignored.
+
+    Args:
+        src_lang: source language code of the direction.
+        tgt_lang: target language code of the direction.
+        save_root: root directory of the downloaded data.
+    """
+    save_dir = os.path.join(save_root, trans_key)
+    os.makedirs(save_dir, exist_ok=True)
+
+    # The remote file name lists the two languages in sorted order,
+    # followed by the target language.
+    first_lang, second_lang = sorted([src_lang, tgt_lang])
+    trans_url = (
+        f"{DOWNLOAD_HUB}/{trans_key}/"
+        f"{first_lang}-{second_lang}_{tgt_lang}.tsv.gz"
+    )
+    # Pass an argument list rather than a shell string so that a save_root
+    # containing spaces or shell metacharacters reaches wget unchanged.
+    subprocess.run(["wget", trans_url, "-P", save_dir], check=False)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save-root", type=str, required=True)
+    args = parser.parse_args()
+
+    # download transcriptions of the target speech for every direction
+    for src_lang in TRANS_SRC_LANGS:
+        for tgt_lang in TRANS_TGT_LANGS:
+            if src_lang == tgt_lang:
+                continue
+            download_transcriptions(
+                src_lang, tgt_lang, save_root=args.save_root
+            )
diff --git
a/examples/speech_matrix/valid_test_sets/download_vp_valid_test.py b/examples/speech_matrix/valid_test_sets/download_vp_valid_test.py
new file mode 100644
index 0000000000..cfa6b3bd45
--- /dev/null
+++ b/examples/speech_matrix/valid_test_sets/download_vp_valid_test.py
@@ -0,0 +1,34 @@
+import os
+import sys
+import argparse
+import subprocess
+from examples.speech_matrix.data_helper.data_cfg import (
+    DOWNLOAD_HUB,
+    audio_key,
+)
+
+
+def download_data(save_root):
+    """Download ``valid_test_vp_aud.zip`` into ``<save_root>/<audio_key>/``.
+
+    The archive is fetched with ``wget``; the target directory is created
+    first if needed, and the exit status of ``wget`` is ignored.
+
+    Args:
+        save_root: root directory of the downloaded data.
+    """
+    save_dir = os.path.join(save_root, audio_key)
+    os.makedirs(save_dir, exist_ok=True)
+    aud_url = f"{DOWNLOAD_HUB}/{audio_key}/valid_test_vp_aud.zip"
+    # Pass an argument list rather than a shell string so that a save_root
+    # containing spaces or shell metacharacters reaches wget unchanged.
+    subprocess.run(["wget", aud_url, "-P", save_dir], check=False)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save-root", type=str, required=True)
+    args = parser.parse_args()
+
+    download_data(args.save_root)