From c3260f242f0f331d5131dbd7ac9835f391756e06 Mon Sep 17 00:00:00 2001
From: Soonshin Seo
Date: Sat, 20 Apr 2019 04:38:37 +0900
Subject: [PATCH] [egs] Make voxceleb recipe work with latest version of the dataset (#3249)

---
 egs/voxceleb/v1/local/make_voxceleb1_v2.pl | 118 +++++++++++++++++++++
 egs/voxceleb/v1/run.sh                     |  11 +-
 egs/voxceleb/v2/run.sh                     |  11 +-
 3 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100755 egs/voxceleb/v1/local/make_voxceleb1_v2.pl

diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
new file mode 100755
index 00000000000..905b43d31a6
--- /dev/null
+++ b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
@@ -0,0 +1,118 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#           2019  Soonshin Seo
+#
+# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
+#
+# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
+# The script 'make_voxceleb1.pl' works for the oldest version of the corpus.
+# This script should be used if you've downloaded the corpus recently.
+#
+# Reads <path-to-voxceleb1>/<dataset>/wav/<speaker>/<recording>/<name>.wav and
+# writes wav.scp, utt2spk and spk2utt (plus trials for 'test') to <path-to-data-dir>.
+
+use strict;
+use warnings;
+
+if (@ARGV != 3) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
+  exit(1);
+}
+
+my ($data_base, $dataset, $out_dir) = @ARGV;
+
+if ($dataset ne "dev" && $dataset ne "test") {
+  die "dataset parameter must be 'dev' or 'test'!";
+}
+
+# The list form of system() keeps the shell from interpreting $out_dir.
+if (system("mkdir", "-p", $out_dir) != 0) {
+  die "Error making directory $out_dir";
+}
+
+# Returns the names of the sub-directories of $dir, without '.' and '..'.
+sub list_subdirs {
+  my ($dir) = @_;
+  opendir(my $dh, $dir) or die "Cannot open directory $dir: $!";
+  my @subdirs = grep { !/^\.{1,2}$/ && -d "$dir/$_" } readdir($dh);
+  closedir($dh);
+  return @subdirs;
+}
+
+# Returns the names of the .wav files in $dir with the extension removed.
+sub list_wav_names {
+  my ($dir) = @_;
+  opendir(my $dh, $dir) or die "Cannot open directory $dir: $!";
+  my @names = grep { /\.wav$/ } readdir($dh);
+  closedir($dh);
+  s/\.wav$// for @names;
+  return @names;
+}
+
+# Maps a trial path '<speaker>/<recording>/<name>.wav' to the utterance ID
+# '<speaker>-<recording>-<name>' that is written to wav.scp and utt2spk.
+sub trial_path_to_utt_id {
+  my ($path) = @_;
+  my ($spkr_id, $rec_id, $name) = split('/', $path);
+  defined($name) or die "Malformed trial path: $path";
+  # Drop the extension so that trial IDs match the IDs in wav.scp.
+  $name =~ s/\.wav$//;
+  return "$spkr_id-$rec_id-$name";
+}
+
+my $wav_root = "$data_base/$dataset/wav";
+my @spkr_dirs = list_subdirs($wav_root);
+
+if ($dataset eq "test") {
+  my $trials_in = "$data_base/voxceleb1_test_v2.txt";
+  my $trials_out = "$out_dir/trials";
+  if (! -e $trials_in) {
+    if (system("wget", "-O", $trials_in, "http://www.openslr.org/resources/49/voxceleb1_test_v2.txt") != 0) {
+      # wget -O leaves an empty or partial file behind; remove it so a rerun retries.
+      unlink($trials_in);
+      die "Error downloading the verification trials file $trials_in";
+    }
+  }
+
+  open(my $trial_in_fh, "<", $trials_in) or die "could not open the verification trials file $trials_in";
+  open(my $trial_out_fh, ">", $trials_out) or die "Could not open the output file $trials_out";
+  while (my $line = <$trial_in_fh>) {
+    chomp($line);
+    next if $line =~ /^\s*$/;
+    my ($tar_or_non, $path1, $path2) = split(' ', $line);
+    defined($path2) or die "Malformed line $. in $trials_in: $line";
+    my $utt_id1 = trial_path_to_utt_id($path1);
+    my $utt_id2 = trial_path_to_utt_id($path2);
+    my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
+    print $trial_out_fh "$utt_id1 $utt_id2 $target\n";
+  }
+  close($trial_in_fh) or die "Error closing $trials_in";
+  close($trial_out_fh) or die "Error closing $trials_out";
+}
+
+open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+open(my $wav_fh, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+foreach my $spkr_id (@spkr_dirs) {
+  foreach my $rec_id (list_subdirs("$wav_root/$spkr_id")) {
+    my $rec_dir = "$wav_root/$spkr_id/$rec_id";
+    foreach my $name (list_wav_names($rec_dir)) {
+      my $utt_id = "$spkr_id-$rec_id-$name";
+      print $wav_fh "$utt_id $rec_dir/$name.wav\n";
+      print $spkr_fh "$utt_id $spkr_id\n";
+    }
+  }
+}
+close($spkr_fh) or die "Error closing $out_dir/utt2spk";
+close($wav_fh) or die "Error closing $out_dir/wav.scp";
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/voxceleb/v1/run.sh b/egs/voxceleb/v1/run.sh
index 8af2226423d..500c05c5db6 100755
--- a/egs/voxceleb/v1/run.sh
+++ b/egs/voxceleb/v1/run.sh
@@ -14,7 +14,7 @@ set -e
 mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -24,11 +24,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script reates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi
 
diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh
index 37bb60fe35c..44340873a80 100755
--- a/egs/voxceleb/v2/run.sh
+++ b/egs/voxceleb/v2/run.sh
@@ -15,7 +15,7 @@ mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -27,11 +27,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script creates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi
 