-
Notifications
You must be signed in to change notification settings - Fork 303
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Compute features of librispeech and musan.
- Loading branch information
1 parent
40eed74
commit 0b19aa0
Showing
8 changed files
with
322 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This file computes fbank features of the librispeech dataset. | ||
It looks for manifests in the directory data/manifests | ||
and generated fbank features are saved in data/fbank. | ||
""" | ||
|
||
import os | ||
import subprocess | ||
from contextlib import contextmanager | ||
from pathlib import Path | ||
|
||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine | ||
from lhotse.recipes.utils import read_manifests_if_cached | ||
|
||
|
||
@contextmanager
def get_executor():
    """Yield an executor for distributed feature extraction, or ``None``.

    On the CLSP grid (hostname ending in ``.clsp.jhu.edu``) this yields a
    Dask ``Client`` backed by a Grid Engine cluster scaled to 80 workers.
    Anywhere else — or on any setup failure — it yields ``None``, in which
    case ``compute_and_store_features`` instantiates a local pool itself.
    """
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context manager ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
        # (see https://github.com/pzelasko/plz for reference)
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        # Deliberate best-effort fallback: missing plz/distributed, a failed
        # hostname lookup, or cluster setup errors all fall through to the
        # local-executor path below.  A bare "except:" would also swallow
        # SystemExit/KeyboardInterrupt, so catch Exception only.
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None
|
||
|
||
def compute_fbank_librispeech():
    """Compute fbank features for all LibriSpeech partitions.

    Reads manifests from ``data/manifests`` and writes the feature archives
    plus cut manifests to ``data/fbank``.  Training partitions are augmented
    with 0.9x and 1.1x speed-perturbed copies before extraction.  Partitions
    whose cut manifest already exists are skipped.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # Cap local parallelism; the distributed path overrides this below.
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for part_name, part_manifests in manifests.items():
            cuts_path = output_dir / f"cuts_{part_name}.json.gz"
            if cuts_path.is_file():
                print(f"{part_name} already exists - skipping.")
                continue
            print("Processing", part_name)
            cut_set = CutSet.from_manifests(
                recordings=part_manifests["recordings"],
                supervisions=part_manifests["supervisions"],
            )
            if "train" in part_name:
                # Augment training data with speed perturbation.
                perturbed_slow = cut_set.perturb_speed(0.9)
                perturbed_fast = cut_set.perturb_speed(1.1)
                cut_set = cut_set + perturbed_slow + perturbed_fast
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{part_name}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(cuts_path)
|
||
|
||
# Entry point: compute features when executed as a standalone script.
if __name__ == "__main__":
    compute_fbank_librispeech()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This file computes fbank features of the musan dataset. | ||
It looks for manifests in the directory data/manifests | ||
and generated fbank features are saved in data/fbank. | ||
""" | ||
|
||
import os | ||
import subprocess | ||
from contextlib import contextmanager | ||
from pathlib import Path | ||
|
||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine | ||
from lhotse.recipes.utils import read_manifests_if_cached | ||
|
||
|
||
@contextmanager
def get_executor():
    """Yield an executor for distributed feature extraction, or ``None``.

    On the CLSP grid (hostname ending in ``.clsp.jhu.edu``) this yields a
    Dask ``Client`` backed by a Grid Engine cluster scaled to 80 workers.
    Anywhere else — or on any setup failure — it yields ``None``, in which
    case ``compute_and_store_features`` instantiates a local pool itself.
    """
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context manager ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from that, provided a cluster-specific wrapper.
        # (see https://github.com/pzelasko/plz for reference)
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        # Deliberate best-effort fallback: missing plz/distributed, a failed
        # hostname lookup, or cluster setup errors all fall through to the
        # local-executor path below.  A bare "except:" would also swallow
        # SystemExit/KeyboardInterrupt, so catch Exception only.
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None
|
||
|
||
def compute_fbank_musan():
    """Compute fbank features for the MUSAN corpus.

    Reads the music/speech/noise manifests from ``data/manifests``, cuts the
    recordings into 10-second windows, keeps windows longer than 5 seconds,
    extracts fbank features, and writes ``cuts_musan.json.gz`` plus the
    feature archive to ``data/fbank``.  Does nothing if the cut manifest
    already exists.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # Cap local parallelism; the distributed path overrides this below.
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = ("music", "speech", "noise")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        print(f"{musan_cuts_path} already exists - skipping")
        return

    print("Extracting features for Musan")

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        # Merge all MUSAN parts into a single recording set.
        recordings = combine(
            part["recordings"] for part in manifests.values()
        )
        # create chunks of Musan with duration 5 - 10 seconds
        windows = CutSet.from_manifests(recordings=recordings)
        windows = windows.cut_into_windows(10.0)
        windows = windows.filter(lambda c: c.duration > 5)
        musan_cuts = windows.compute_and_store_features(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_musan",
            # when an executor is specified, make more partitions
            num_jobs=num_jobs if ex is None else 80,
            executor=ex,
            storage_type=LilcomHdf5Writer,
        )
        musan_cuts.to_json(musan_cuts_path)
|
||
|
||
# Entry point: compute features when executed as a standalone script.
if __name__ == "__main__":
    compute_fbank_musan()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This file downloads the librispeech dataset | ||
to the directory data/LibriSpeech. | ||
It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh . | ||
""" | ||
|
||
|
||
from lhotse.recipes import download_librispeech | ||
|
||
|
||
def download_data():
    """Download the LibriSpeech corpus into ``data/LibriSpeech``.

    Compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh.
    """
    # "librispeech" requests the full set of LibriSpeech parts from lhotse.
    download_librispeech(target_dir="data", dataset_parts="librispeech")
|
||
|
||
# Entry point: download the corpus when executed as a standalone script.
if __name__ == "__main__":
    download_data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This file generates manifests for the librispeech dataset. | ||
It expects the dataset is saved in data/LibriSpeech | ||
and the generated manifests are saved in data/manifests. | ||
""" | ||
|
||
import os | ||
from pathlib import Path | ||
|
||
from lhotse.recipes import prepare_librispeech | ||
|
||
|
||
def prepare_librispeech_mainfest():
    """Create LibriSpeech manifests under ``data/manifests``.

    Expects the corpus in ``data/LibriSpeech``.  ``prepare_librispeech``
    persists the recording/supervision manifests to ``output_dir`` itself,
    so its return value is not needed here.

    NOTE(review): the name spells "manifest" as "mainfest"; kept as-is
    because the ``__main__`` guard calls it by this name.
    """
    corpus_dir = Path("data/LibriSpeech")
    output_dir = Path("data/manifests")
    # Cap parallelism on large machines.
    num_jobs = min(15, os.cpu_count())

    # Fix: the result was previously bound to an unused local variable
    # (librispeech_manifests); drop the dead binding since output_dir
    # already persists everything.
    prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts="auto",
        output_dir=output_dir,
        num_jobs=num_jobs,
    )
|
||
|
||
# Entry point: prepare manifests when executed as a standalone script.
if __name__ == "__main__":
    prepare_librispeech_mainfest()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
This file generates manifests for the musan dataset. | ||
It expects the dataset is saved in data/musan | ||
and the generated manifests are saved in data/manifests. | ||
""" | ||
|
||
from pathlib import Path | ||
|
||
from lhotse.recipes import prepare_musan | ||
|
||
|
||
def prepare_musan_mainfest():
    """Create MUSAN manifests in ``data/manifests`` from ``data/musan``."""
    # NOTE(review): "mainfest" is a typo for "manifest"; kept because the
    # __main__ guard calls it by this name.
    prepare_musan(
        corpus_dir=Path("data/musan"),
        output_dir=Path("data/manifests"),
    )
|
||
|
||
# Entry point: prepare manifests when executed as a standalone script.
if __name__ == "__main__":
    prepare_musan_mainfest()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters