Merge branch 'main' into t5_lm_adaptation

NVIDIA · Feb 16, 2022 · 7d1626f
2 parents 69acc37 + b5012d0
commit 7d1626f
Show file tree

Hide file tree

Showing 10 changed files with 19 additions and 16 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -81,9 +81,9 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"
     python -c "import nemo.collections.tts as nemo_tts" && \
     python -c "import nemo_text_processing.text_normalization as text_normalization"
 
-# TODO: Try to remove once 21.07 container is the base container
+# TODO: Update to newer numba 0.56.0RC1 for 22.02 container
 # install pinned numba version
-RUN conda install -c conda-forge numba=0.54.1
+# RUN conda install -c conda-forge numba==0.54.1
 
 # copy scripts/examples/tests into container for end user
 WORKDIR /workspace/nemo

diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py
@@ -169,8 +169,8 @@ def inverse(self, magnitude, phase):
 
         if self.window is not None:
             window_sum = librosa.filters.window_sumsquare(
-                self.window,
-                magnitude.size(-1),
+                window=self.window,
+                n_frames=magnitude.size(-1),
                 hop_length=self.hop_length,
                 win_length=self.win_length,
                 n_fft=self.filter_length,
@@ -302,7 +302,8 @@ def __init__(
         highfreq = highfreq or sample_rate / 2
 
         filterbanks = torch.tensor(
-            librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float
+            librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq),
+            dtype=torch.float,
         ).unsqueeze(0)
         self.register_buffer("fb", filterbanks)
 

diff --git a/nemo/collections/asr/parts/preprocessing/perturb.py b/nemo/collections/asr/parts/preprocessing/perturb.py
@@ -162,7 +162,9 @@ def perturb(self, data):
             return
 
         new_sr = int(self._sr * speed_rate)
-        data._samples = librosa.core.resample(data._samples, self._sr, new_sr, res_type=self._res_type)
+        data._samples = librosa.core.resample(
+            data._samples, orig_sr=self._sr, target_sr=new_sr, res_type=self._res_type
+        )
 
 
 class TimeStretchPerturbation(Perturbation):

diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py
@@ -72,10 +72,10 @@ def __init__(self, samples, sample_rate, target_sr=None, trim=False, trim_db=60,
         """
         samples = self._convert_samples_to_float32(samples)
         if target_sr is not None and target_sr != sample_rate:
-            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
             sample_rate = target_sr
         if trim:
-            samples, _ = librosa.effects.trim(samples, trim_db)
+            samples, _ = librosa.effects.trim(samples, top_db=trim_db)
         self._samples = samples
         self._sample_rate = sample_rate
         if self._samples.ndim >= 2:

diff --git a/nemo/collections/tts/data/datalayers.py b/nemo/collections/tts/data/datalayers.py
@@ -459,7 +459,7 @@ def setup_noise_augmented_dataset(files_list, num_snr, kwargs_stft, dest, desc):
         for line in list_file_pbar:
             audio_file = line.split('|')[0]
             speech = sf.read(audio_file)[0].astype(np.float32)
-            spec_clean = np.ascontiguousarray(librosa.stft(speech, **kwargs_stft))
+            spec_clean = np.ascontiguousarray(librosa.stft(y=speech, **kwargs_stft))
             mag_clean = np.ascontiguousarray(np.abs(spec_clean)[..., np.newaxis])
             signal_power = np.mean(np.abs(speech) ** 2)
 
@@ -472,7 +472,7 @@ def setup_noise_augmented_dataset(files_list, num_snr, kwargs_stft, dest, desc):
                 snr = librosa.db_to_power(snr_db)
                 noise_power = signal_power / snr
                 noisy = speech + np.sqrt(noise_power) * np.random.randn(len(speech))
-                spec_noisy = librosa.stft(noisy, **kwargs_stft)
+                spec_noisy = librosa.stft(y=noisy, **kwargs_stft)
                 spec_noisy = np.ascontiguousarray(spec_noisy)
                 T_x = spec_noisy.shape[1]
                 x = spec_noisy.view(dtype=np.float32).reshape((*spec_noisy.shape, 2))

diff --git a/nemo/collections/tts/models/degli.py b/nemo/collections/tts/models/degli.py
@@ -94,7 +94,7 @@ def reconstruct_wave(*args: ndarray, kwargs_istft, n_sample=-1) -> ndarray:
     if spec is None:
         spec = mag * np.exp(1j * phase)
 
-    wave = librosa.istft(spec, **kwargs_istft, **kwarg_len)
+    wave = librosa.istft(stft_matrix=spec, **kwargs_istft, **kwarg_len)
     return wave
 
 

diff --git a/nemo/collections/tts/torch/data.py b/nemo/collections/tts/torch/data.py
@@ -226,7 +226,7 @@ def __init__(
         self.hop_len = self.hop_length or self.n_fft // 4
         self.fb = torch.tensor(
             librosa.filters.mel(
-                self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.lowfreq, fmax=self.highfreq
+                sr=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, fmin=self.lowfreq, fmax=self.highfreq
             ),
             dtype=torch.float,
         ).unsqueeze(0)

diff --git a/scripts/dataset_processing/process_vad_data.py b/scripts/dataset_processing/process_vad_data.py
@@ -184,7 +184,7 @@ def write_manifest(
 
             try:
                 x, _sr = librosa.load(file, sr=sr)
-                duration = librosa.get_duration(x, sr=sr)
+                duration = librosa.get_duration(y=x, sr=sr)
 
             except Exception:
                 continue
@@ -312,7 +312,7 @@ def generate_variety_noise(data_dir, filename, prefix):
         files = allfile.read().splitlines()
 
     for file in files:
-        y, sr = librosa.load(file, sr=sampling_rate)
+        y, sr = librosa.load(path=file, sr=sampling_rate)
 
         for i in range(
             0, len(y) - sampling_rate, silence_stride * 100

diff --git a/scripts/freesound_download_resample/freesound_download.py b/scripts/freesound_download_resample/freesound_download.py
@@ -324,7 +324,7 @@ def download_song(basepath, id, name, download_url):
     # Delete and then re-download
     if os.path.exists(fp):
         try:
-            _ = librosa.load(fp)
+            _ = librosa.load(path=fp)
         except Exception:
             # File is currupted, delete and re-download.
             os.remove(fp)

diff --git a/scripts/freesound_download_resample/freesound_resample.py b/scripts/freesound_download_resample/freesound_resample.py
@@ -64,7 +64,7 @@ def resample_file(resampled_dir, filepath, ext, sample_rate):
 
         try:
             # Check if the file is readable
-            librosa.load(filepath)
+            librosa.load(path=filepath)
 
             # if it is, force input format and try again
             transform.set_input_format(file_type=ext)