Skip to content

Commit

Permalink
[egs] Make voxceleb recipe work with latest version of the dataset (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
sunshines14 authored and danpovey committed Apr 19, 2019
1 parent 84ecd0e commit c3260f2
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 8 deletions.
123 changes: 123 additions & 0 deletions egs/voxceleb/v1/local/make_voxceleb1_v2.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/perl
#
# Copyright 2018 Ewald Enzinger
# 2018 David Snyder
# 2019 Soonshin Seo
#
# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
#
# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
# The script 'make_voxceleb1.pl' works for the oldest version of the corpus.
# This script should be used if you've downloaded the corpus recently.

# Parse and validate the command line, create the output directory, and
# collect the per-speaker sub-directories of <path-to-voxceleb1>/<dataset>/wav.
if (@ARGV != 3) {
    print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
    print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
    exit(1);
}

# File-scoped lexicals; the remainder of the script reads these three values.
my ($data_base, $dataset, $out_dir) = @ARGV;

if ($dataset ne "dev" && $dataset ne "test") {
    die "dataset parameter must be 'dev' or 'test'!";
}

# List-form system() bypasses the shell, so an output path containing spaces
# or shell metacharacters reaches mkdir as a single argument.
if (system("mkdir", "-p", $out_dir) != 0) {
    die "Error making directory $out_dir";
}

# Every directory directly under .../wav (other than . and ..) is kept; the
# rest of the script uses each name as a speaker ID.
my $wav_root = "$data_base/$dataset/wav";
opendir my $dh, $wav_root or die "Cannot open directory $wav_root: $!";
my @spkr_dirs = grep { -d "$wav_root/$_" && !/^\.{1,2}$/ } readdir($dh);
closedir $dh;

# 'dev' portion: walk <data_base>/dev/wav/<speaker>/<recording>/*.wav and write
# one utt2spk line and one wav.scp line per wav file into <out_dir>.
# The utterance ID is "<speaker>-<recording>-<file name without extension>".
if ($dataset eq "dev") {
    my $wav_base = "$data_base/$dataset/wav";

    open(my $utt2spk_fh, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
    open(my $wav_scp_fh, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";

    for my $speaker (@spkr_dirs) {
        my $speaker_path = "$wav_base/$speaker";

        # Recording directories: every sub-directory except . and ..
        opendir(my $speaker_dh, "$speaker_path/") or die "Cannot open directory: $!";
        my @recordings;
        for my $entry (readdir($speaker_dh)) {
            next if $entry =~ /^\.{1,2}$/;
            next unless -d "$speaker_path/$entry";
            push @recordings, $entry;
        }
        closedir $speaker_dh;

        for my $recording (@recordings) {
            my $recording_path = "$speaker_path/$recording";

            opendir(my $recording_dh, "$recording_path/") or die "Cannot open directory: $!";
            my @entries = readdir($recording_dh);
            closedir $recording_dh;

            for my $entry (@entries) {
                # Only *.wav entries become utterances.
                next unless $entry =~ /\.wav$/;

                # Drop the extension to get the utterance's base name.
                (my $stem = $entry) =~ s/\.[^.]+$//;

                my $utt = "$speaker-$recording-$stem";
                print {$wav_scp_fh} "$utt $recording_path/$stem.wav\n";
                print {$utt2spk_fh} "$utt $speaker\n";
            }
        }
    }

    close($utt2spk_fh) or die;
    close($wav_scp_fh) or die;
}

# 'test' portion: make sure the verification trial list is present (downloading
# it if it is missing), convert it to <out_dir>/trials, and write utt2spk and
# wav.scp for every wav file found under <data_base>/test/wav.
if ($dataset eq "test") {
    my $trials_src = "$data_base/voxceleb1_test_v2.txt";

    if (! -e $trials_src) {
        my $url = "http://www.openslr.org/resources/49/voxceleb1_test_v2.txt";
        # List-form system() avoids the shell. On failure, remove whatever was
        # written so that a later run does not mistake a partial download for
        # the real list (the -e test above only checks for existence).
        if (system("wget", "-O", $trials_src, $url) != 0) {
            unlink $trials_src;
            die "Error downloading $url to $trials_src";
        }
    }

    open(my $trial_in, "<", $trials_src) or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt";
    open(my $trial_out, ">", "$out_dir/trials") or die "Could not open the output file $out_dir/trials";
    open(my $utt2spk_fh, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
    open(my $wav_scp_fh, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";

    # Turn a trial path "<speaker>/<recording>/<file>.wav" into the utterance
    # ID "<speaker>-<recording>-<file>". The .wav suffix must be stripped so
    # the ID matches the ones written to utt2spk and wav.scp below.
    my $trial_utt_id = sub {
        my ($path) = @_;
        my ($spkr_id, $rec_id, $name) = split('/', $path);
        $name =~ s/\.wav$//;
        return "$spkr_id-$rec_id-$name";
    };

    # Each input line is "<label> <path1> <path2>"; label "1" marks a target
    # trial, anything else is written as nontarget.
    while (my $line = <$trial_in>) {
        chomp $line;
        my ($tar_or_non, $path1, $path2) = split ' ', $line;
        next unless defined $path2;    # skip blank or truncated lines

        my $utt_id1 = $trial_utt_id->($path1);
        my $utt_id2 = $trial_utt_id->($path2);
        my $target  = ($tar_or_non eq "1") ? "target" : "nontarget";
        print {$trial_out} "$utt_id1 $utt_id2 $target\n";
    }

    # Walk <speaker>/<recording>/*.wav and list every utterance.
    my $wav_base = "$data_base/$dataset/wav";
    for my $spkr_id (@spkr_dirs) {
        my $spkr_path = "$wav_base/$spkr_id";
        opendir(my $spkr_dh, "$spkr_path/") or die "Cannot open directory $spkr_path: $!";
        my @rec_dirs = grep { -d "$spkr_path/$_" && !/^\.{1,2}$/ } readdir($spkr_dh);
        closedir $spkr_dh;

        for my $rec_id (@rec_dirs) {
            my $rec_path = "$spkr_path/$rec_id";
            opendir(my $rec_dh, "$rec_path/") or die "Cannot open directory $rec_path: $!";
            my @wav_files = grep { /\.wav$/ } readdir($rec_dh);
            closedir $rec_dh;

            for my $file (@wav_files) {
                (my $name = $file) =~ s/\.[^.]+$//;    # base name without extension
                my $utt_id = "$spkr_id-$rec_id-$name";
                print {$wav_scp_fh} "$utt_id $rec_path/$name.wav\n";
                print {$utt2spk_fh} "$utt_id $spkr_id\n";
            }
        }
    }

    close($utt2spk_fh) or die;
    close($wav_scp_fh) or die;
    close($trial_out) or die;
    close($trial_in) or die;
}

# Post-processing of the output directory with the utils/ helper scripts:
# build spk2utt from utt2spk, run fix_data_dir.sh, then validate the result.
my $spk2utt_cmd = "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt";
system($spk2utt_cmd) == 0
    or die "Error creating spk2utt file in directory $out_dir";

# The exit status of fix_data_dir.sh is not checked; the validation step that
# follows is what decides whether the directory is acceptable.
my $fix_cmd = "env LC_COLLATE=C utils/fix_data_dir.sh $out_dir";
system($fix_cmd);

my $validate_cmd = "env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir";
system($validate_cmd) == 0
    or die "Error validating directory $out_dir";
11 changes: 7 additions & 4 deletions egs/voxceleb/v1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc

# The trials file is downloaded by local/make_voxceleb1.pl.
# The trials file is downloaded by local/make_voxceleb1_v2.pl.
voxceleb1_trials=data/voxceleb1_test/trials
voxceleb1_root=/export/corpora/VoxCeleb1
voxceleb2_root=/export/corpora/VoxCeleb2
Expand All @@ -24,11 +24,14 @@ stage=0
if [ $stage -le 0 ]; then
local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
# This script reates data/voxceleb1_test and data/voxceleb1_train.
# This script creates data/voxceleb1_test and data/voxceleb1_train for the latest version of VoxCeleb1.
# Our evaluation set is the test portion of VoxCeleb1.
local/make_voxceleb1.pl $voxceleb1_root data
local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
# If you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
# local/make_voxceleb1.pl $voxceleb1_root data
# We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
# This should give 7,351 speakers and 1,277,503 utterances.
# This should give 7,323 speakers and 1,276,888 utterances.
utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
fi

Expand Down
11 changes: 7 additions & 4 deletions egs/voxceleb/v2/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc


# The trials file is downloaded by local/make_voxceleb1.pl.
# The trials file is downloaded by local/make_voxceleb1_v2.pl.
voxceleb1_trials=data/voxceleb1_test/trials
voxceleb1_root=/export/corpora/VoxCeleb1
voxceleb2_root=/export/corpora/VoxCeleb2
Expand All @@ -27,11 +27,14 @@ stage=0
if [ $stage -le 0 ]; then
local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
# This script creates data/voxceleb1_test and data/voxceleb1_train.
# This script creates data/voxceleb1_test and data/voxceleb1_train for the latest version of VoxCeleb1.
# Our evaluation set is the test portion of VoxCeleb1.
local/make_voxceleb1.pl $voxceleb1_root data
local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
# If you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
# local/make_voxceleb1.pl $voxceleb1_root data
# We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
# This should give 7,351 speakers and 1,277,503 utterances.
# This should give 7,323 speakers and 1,276,888 utterances.
utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
fi

Expand Down

0 comments on commit c3260f2

Please sign in to comment.