From c3260f242f0f331d5131dbd7ac9835f391756e06 Mon Sep 17 00:00:00 2001
From: Soonshin Seo <ssseo@sogang.ac.kr>
Date: Sat, 20 Apr 2019 04:38:37 +0900
Subject: [PATCH] [egs] Make voxceleb recipe work with latest version of the
 dataset  (#3249)

---
 egs/voxceleb/v1/local/make_voxceleb1_v2.pl | 123 +++++++++++++++++++++
 egs/voxceleb/v1/run.sh                     |  11 +-
 egs/voxceleb/v2/run.sh                     |  11 +-
 3 files changed, 137 insertions(+), 8 deletions(-)
 create mode 100755 egs/voxceleb/v1/local/make_voxceleb1_v2.pl
diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
new file mode 100755
index 00000000000..905b43d31a6
--- /dev/null
+++ b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#           2019  Soonshin Seo
+#
+# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
+#
+# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
+# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. 
+# This script should be used if you've downloaded the corpus recently.
+
+if (@ARGV != 3) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
+  exit(1);
+}
+
+($data_base, $dataset, $out_dir) = @ARGV;
+
+if ("$dataset" ne "dev" && "$dataset" ne "test") {
+  die "dataset parameter must be 'dev' or 'test'!";
+}
+
+if (system("mkdir -p $out_dir") != 0) {
+  die "Error making directory $out_dir";
+}
+
+opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!";
+my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh);
+closedir $dh;
+
+if ($dataset eq "dev"){
+  open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  foreach (@spkr_dirs) {
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+	  my $rec_id = $_;
+	  opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+	  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+	  closedir $dh;
+  	  foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TRAIN "$utt_id", " $wav", "\n";
+        print SPKR_TRAIN "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TRAIN) or die;
+  close(WAV_TRAIN) or die;
+}
+
+if ($dataset eq "test"){
+  if (! -e "$data_base/voxceleb1_test_v2.txt") {
+    system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt");
+  }
+
+  open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt";
+  open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_test_dir/trials";
+  open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
+  open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";
+
+  my $test_spkrs = ();
+  while (<TRIAL_IN>) {
+    chomp;
+    my ($tar_or_non, $path1, $path2) = split;
+    # Create entry for left-hand side of trial
+    my ($spkr_id, $rec_id, $name) = split('/', $path1);
+    my $utt_id1 = "$spkr_id-$rec_id-$name";
+    $test_spkrs{$spkr_id} = ();
+
+    # Create entry for right-hand side of trial
+    my ($spkr_id, $rec_id, $name) = split('/', $path2);
+    my $utt_id2 = "$spkr_id-$rec_id-$name";
+    $test_spkrs{$spkr_id} = ();
+
+    my $target = "nontarget";
+    if ($tar_or_non eq "1") {
+      $target = "target";
+    }
+    print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
+  }
+
+  foreach (@spkr_dirs) {
+    my $spkr_id = $_;
+    opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!";
+    my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
+    closedir $dh;
+    foreach (@rec_dirs) {
+	  my $rec_id = $_;
+	  opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
+	  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+	  closedir $dh;
+  	  foreach (@files) {
+        my $name = $_;
+        my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav";
+        my $utt_id = "$spkr_id-$rec_id-$name";
+        print WAV_TEST "$utt_id", " $wav", "\n";
+        print SPKR_TEST "$utt_id", " $spkr_id", "\n";
+      }
+    }
+  }
+  close(SPKR_TEST) or die;
+  close(WAV_TEST) or die;
+  close(TRIAL_OUT) or die;
+  close(TRIAL_IN) or die;
+}
+
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/voxceleb/v1/run.sh b/egs/voxceleb/v1/run.sh
index 8af2226423d..500c05c5db6 100755
--- a/egs/voxceleb/v1/run.sh
+++ b/egs/voxceleb/v1/run.sh
@@ -14,7 +14,7 @@ set -e
 mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -24,11 +24,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script reates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi
 
diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh
index 37bb60fe35c..44340873a80 100755
--- a/egs/voxceleb/v2/run.sh
+++ b/egs/voxceleb/v2/run.sh
@@ -15,7 +15,7 @@ mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 
 
-# The trials file is downloaded by local/make_voxceleb1.pl.
+# The trials file is downloaded by local/make_voxceleb1_v2.pl.
 voxceleb1_trials=data/voxceleb1_test/trials
 voxceleb1_root=/export/corpora/VoxCeleb1
 voxceleb2_root=/export/corpora/VoxCeleb2
@@ -27,11 +27,14 @@ stage=0
 if [ $stage -le 0 ]; then
   local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
   local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
-  # This script creates data/voxceleb1_test and data/voxceleb1_train.
+  # This script creates data/voxceleb1_test and data/voxceleb1_train for latest version of VoxCeleb1.
   # Our evaluation set is the test portion of VoxCeleb1.
-  local/make_voxceleb1.pl $voxceleb1_root data
+  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
+  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test 
+  # if you downloaded the dataset soon after it was released, you will want to use the make_voxceleb1.pl script instead.
+  # local/make_voxceleb1.pl $voxceleb1_root data
   # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
-  # This should give 7,351 speakers and 1,277,503 utterances.
+  # This should give 7,323 speakers and 1,276,888 utterances.
   utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
 fi