facebookresearch · hygong-fb · Dec 24, 2022 · Oct 2, 2023 · erip · Feb 7, 2023
diff --git a/examples/speech_matrix/README.md b/examples/speech_matrix/README.md
@@ -48,6 +48,15 @@ Audios are saved to ${SAVE_ROOT}/audios/. For example, English audios are compre
 
 Speech alignments are saved to ${SAVE_ROOT}/aligned_speech/. For example, en-fr.tsv.gz contains a pair of aligned audio paths in English and French respectively together with their alignment score in each line.
 
+## Speech Transcriptions
+
+While SpeechMatrix focuses on speech-only data mining and translation, we provide transcriptions for the mined speech in case they are needed for future research. The transcriptions are generated with [Whisper](https://github.com/openai/whisper): we use medium.en for transcribing English and medium for other languages. Currently, transcriptions are provided for the target speech in these language directions: {"cs", "de", "en", "es", "et", "fi", "fr", "hu", "it", "lt", "nl", "pl", "pt", "ro", "sl"}-{"de", "en", "es", "fr", "nl"}.
+
+```bash
+# SAVE_ROOT: the directory to save mined data
+python mined_train_sets/download_transcriptions.py \
+    --save-root ${SAVE_ROOT}
+```
 
 ## Speech-to-Unit Data
 

diff --git a/examples/speech_matrix/data_helper/data_cfg.py b/examples/speech_matrix/data_helper/data_cfg.py
@@ -139,3 +139,11 @@
 hubert_key = "hubert"
 vocoder_key = "vocoder"
 s2s_key = "s2s_models"
+trans_key = "transcriptions"
+# language directions with transcriptions: each source below x each target
+TRANS_SRC_LANGS = [
+    "cs", "de", "en", "es", "et", "fi",
+    "fr", "hu", "it", "lt", "nl", "pl",
+    "pt", "ro", "sl"
+]
+TRANS_TGT_LANGS = ["de", "en", "es", "fr", "nl"]  # side whose speech is transcribed
diff --git a/examples/speech_matrix/mined_train_sets/download_transcriptions.py b/examples/speech_matrix/mined_train_sets/download_transcriptions.py
@@ -0,0 +1,34 @@
+import os
+import argparse
+from examples.speech_matrix.data_helper.data_cfg import (
+    DOWNLOAD_HUB,
+    VP_LANGS,
+    trans_key,
+    TRANS_SRC_LANGS,
+    TRANS_TGT_LANGS
+)
+
+
+def download_transcriptions(src_lang, tgt_lang, save_root):
+    """Download the `tgt_lang` transcriptions of one pair under `save_root`."""
+    import subprocess  # local import; the argv list below bypasses the shell
+    save_dir = os.path.join(save_root, trans_key)
+    os.makedirs(save_dir, exist_ok=True)
+    name = "-".join(sorted([src_lang, tgt_lang])) + f"_{tgt_lang}.tsv.gz"
+    subprocess.run(["wget", f"{DOWNLOAD_HUB}/{trans_key}/{name}", "-P", save_dir])
+
+
+if __name__ == "__main__":
+    # CLI: --save-root is the directory that receives the downloaded files.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--save-root", type=str, required=True)
+    save_root = cli.parse_args().save_root
+
+    # Fetch transcriptions for every source-target direction (source != target).
+    directions = (
+        (src, tgt)
+        for src in TRANS_SRC_LANGS for tgt in TRANS_TGT_LANGS
+        if src != tgt
+    )
+    for src, tgt in directions:
+        download_transcriptions(src, tgt, save_root=save_root)