-
Notifications
You must be signed in to change notification settings - Fork 4
/
relative_scp_to_abs.py
76 lines (71 loc) · 2.67 KB
/
relative_scp_to_abs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from pathlib import Path
import typer
from typing import Optional
from typing_extensions import Annotated
# Dataset folders (under <vpc_baseline_path>/data) whose wav.scp is rewritten.
_VPC_DATASETS = (
    'vctk_test_trials_m',
    'vctk_test_trials_m_common',
    'vctk_test_trials_m_all',
    'vctk_test_trials_f',
    'vctk_test_trials_f_common',
    'vctk_test_trials_f_all',
    'vctk_test_trials_all',
    'vctk_test_enrolls',
    'vctk_dev_trials_m',
    'vctk_dev_trials_m_common',
    'vctk_dev_trials_m_all',
    'vctk_dev_trials_f',
    'vctk_dev_trials_f_common',
    'vctk_dev_trials_f_all',
    'vctk_dev_trials_all',
    'vctk_dev_enrolls',
    'libri_test_trials_m',
    'libri_test_trials_f',
    'libri_test_trials_all',
    'libri_test_enrolls',
    'libri_dev_trials_m',
    'libri_dev_trials_f',
    'libri_dev_trials_all',
    'libri_dev_enrolls',
)


def _convert_path(raw_path, mode, base, supplied_base):
    """Convert one wav.scp path entry.

    Returns a ``(text, location)`` pair: ``text`` is what gets written back
    to the .scp file, ``location`` is where the file must exist on disk.
    Raises ``ValueError`` if an absolute path lies outside the baseline
    folder and therefore cannot be made relative to it.
    """
    path = Path(raw_path)
    if mode == 'absolute':
        # Joining keeps an already-absolute entry untouched, which makes
        # the conversion safe to run more than once.
        location = base / path
        return str(location), location
    # `supplied_base` is tried as well so that entries written with the
    # baseline path exactly as the user typed it (e.g. a relative folder
    # name) can still be stripped.
    for candidate in (base, supplied_base):
        try:
            relative = path.relative_to(candidate)
        except ValueError:
            continue
        return str(relative), base / relative
    if path.is_absolute():
        raise ValueError(f'{path} is not located inside {base}')
    # Already relative to the baseline folder: nothing to strip.
    return str(path), base / path


def _convert_scp_lines(scp_path, mode, base, supplied_base):
    """Read one wav.scp file and return its converted, validated lines.

    Nothing is written here. Raises ``ValueError`` for malformed entries
    and ``FileNotFoundError`` when a converted entry points at a file
    that does not exist.
    """
    converted = []
    with open(scp_path) as scp:
        for number, line in enumerate(scp, start=1):
            entry = line.strip()
            if not entry:
                # Keep blank lines as they are instead of crashing on them.
                converted.append(line)
                continue
            # scp format is "{utt} {path}\n"; splitting only once keeps
            # paths that contain spaces intact.
            fields = entry.split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(
                    f'{scp_path}:{number}: expected "<utt> <path>", '
                    f'got {line!r}'
                )
            utt, raw_path = fields
            try:
                text, location = _convert_path(
                    raw_path, mode, base, supplied_base
                )
            except ValueError as err:
                raise ValueError(f'{scp_path}:{number}: {err}') from err
            if not location.exists():
                raise FileNotFoundError(
                    f'{scp_path}:{number}: {location} does not exist'
                )
            converted.append(f'{utt} {text}\n')
    return converted


def main(
    vpc_baseline_path: Annotated[Optional[Path], typer.Option()],
    to: str = 'absolute'
):
    """
    This script adapts evaluation datasets generated by the VoicePrivacy
    Challenge 2022 framework codebase. The necessary changes are:
    * Adapting the .scp files such that data paths are no longer relative
      to the VPC baseline/ folder but absolute.
    * The changes can also be reverted with the same script. To do so,
      just call it with --to=relative
    """
    # The docstring above doubles as the CLI help text, so developer notes
    # live in comments instead.
    #
    # vpc_baseline_path: the VPC framework's baseline/ folder; it must
    #     contain data/<dataset>/wav.scp for every name in _VPC_DATASETS.
    # to: 'absolute' or 'relative' (case-insensitive).
    #
    # Raises typer.BadParameter for a missing baseline path or an
    # unsupported `to`, FileNotFoundError when the baseline folder, a
    # wav.scp file or a referenced audio file is missing, and ValueError
    # for malformed .scp entries.
    if vpc_baseline_path is None:
        raise typer.BadParameter('--vpc-baseline-path is required')

    # Normalise once so conversion and validation agree on the mode.
    mode = to.casefold()
    if mode not in ('absolute', 'relative'):
        raise typer.BadParameter(
            f"--to must be 'absolute' or 'relative', got {to!r}"
        )

    supplied_base = vpc_baseline_path.expanduser()
    # Anchor a relative baseline path at the current directory; otherwise
    # the "absolute" entries would still depend on where the script ran.
    base = supplied_base.absolute()
    if not base.exists():
        raise FileNotFoundError(
            f'The supplied path to VPC framework ({base}) does not exist'
        )

    # Convert and validate every dataset before touching any file, so a
    # problem in one dataset cannot leave the tree half-converted.
    pending = []
    for dataset in _VPC_DATASETS:
        scp_path = base / 'data' / dataset / 'wav.scp'
        pending.append(
            (scp_path, _convert_scp_lines(scp_path, mode, base, supplied_base))
        )

    for scp_path, lines in pending:
        with open(scp_path, mode='w') as scp:
            scp.writelines(lines)
# Script entry point: hand `main` to typer only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    typer.run(main)