[FIX] replace tqdm with progressbar [ADD] separate speech/noise mixin…

…g, add option to mix multi noise into one audio [MOD] change FLAGS name, gla iterations is optional
mychiux413 · Mar 31, 2020 · f7d1279 · f7d1279
1 parent 42bc45b
commit f7d1279
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 73 deletions.
diff --git a/DeepSpeech.py b/DeepSpeech.py
@@ -431,7 +431,8 @@ def train():
             FLAGS.augmentation_freq_and_time_masking or
             FLAGS.augmentation_pitch_and_tempo_scaling or
             FLAGS.augmentation_speed_up_std > 0 or
-            FLAGS.train_augmentation_files):
+            FLAGS.train_augmentation_noise_files or
+            FLAGS.train_augmentation_speech_files):
         do_cache_dataset = False
 
     # Create training and validation datasets
@@ -440,7 +441,8 @@ def train():
                                enable_cache=FLAGS.feature_cache and do_cache_dataset,
                                cache_path=FLAGS.feature_cache,
                                train_phase=True,
-                               noise_sources=FLAGS.train_augmentation_files)
+                               noise_sources=FLAGS.train_augmentation_noise_files,
+                               speech_sources=FLAGS.train_augmentation_speech_files)
 
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                  tfv1.data.get_output_shapes(train_set),
@@ -451,7 +453,7 @@ def train():
 
     if FLAGS.dev_files:
         dev_csvs = FLAGS.dev_files.split(',')
-        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs]
+        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_noise_files, speech_sources=FLAGS.dev_augmentation_speech_files) for csv in dev_csvs]
         dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]
 
     # Dropout
@@ -690,7 +692,7 @@ def __call__(self, progress, data, **kwargs):
 
 
 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)

diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py
@@ -3,21 +3,17 @@
 # Make sure we can import stuff from util/
 # This script needs to be run from the root of the DeepSpeech repository
 
-from util.feeding import secs_to_hours
 from librosa import get_duration
 from multiprocessing import Pool
 from functools import partial
 import math
 import argparse
 import sys
 import os
+import progressbar
 sys.path.insert(1, os.path.join(sys.path[0], '..'))
 
-try:
-    import tqdm
-except ImportError as err:
-    print('[ImportError] try `pip install tqdm`')
-    raise err
+from util.feeding import secs_to_hours
 
 try:
     from pydub import AudioSegment
@@ -152,8 +148,10 @@ def main(src_dir,
                                max_duration_seconds=max_duration_seconds)
 
         pool = Pool(processes=None)
-        for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)):
-            pass
+        pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start()
+        for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)):
+            pbar.update(i)
+        pbar.finish()
 
 
 if __name__ == "__main__":

diff --git a/evaluate.py b/evaluate.py
@@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
     return [alphabet.decode(res) for res in results]
 
 
-def evaluate(test_csvs, create_model, try_loading, noise_sources=None):
+def evaluate(test_csvs, create_model, try_loading, noise_sources=None, speech_sources=None):
     if FLAGS.lm_binary_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                         FLAGS.lm_binary_path, FLAGS.lm_trie_path,
@@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_sources=None):
         scorer = None
 
     test_csvs = FLAGS.test_files.split(',')
-    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs]
+    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs]
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
                                                  tfv1.data.get_output_shapes(test_sets[0]),
                                                  output_classes=tfv1.data.get_output_classes(test_sets[0]))

diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py
@@ -109,13 +109,21 @@ def collect_noise_filenames(sources, read_csvs_func):
 
 
 def augment_noise(audio,
-                  noise,
-                  min_audio_dbfs=0.0,
-                  max_audio_dbfs=-35.0,
-                  min_snr_db=3.0,
-                  max_snr_db=30.0,
+                  noise_iterator=None,
+                  speech_iterator=None,
+                  min_n_noises=0,
+                  max_n_noises=1,
+                  min_n_speakers=0,
+                  max_n_speakers=1,
+                  min_audio_dbfs=-35.0,
+                  max_audio_dbfs=0.0,
+                  min_noise_snr_db=3.0,
+                  max_noise_snr_db=30.0,
+                  min_speech_snr_db=3.0,
+                  max_speech_snr_db=30.0,
                   limit_audio_peak_dbfs=7.0,
                   limit_noise_peak_dbfs=3.0,
+                  limit_speech_peak_dbfs=7.0,
                   sample_rate=16000):
     r"""Mix audio Tensor with noise Tensor
 
@@ -125,65 +133,106 @@ def augment_noise(audio,
 
     Args:
         audio: A 2-D Tensor with shape [`time-steps`, 1].
-        noise: A 2-D Tensor with shape [`time-steps`, 1].
+        noise_iterator: A one-shot iterator over noise files; each yielded item has shape [`time-steps`, 1].
+        speech_iterator: A one-shot iterator over speech files; each yielded item has shape [`time-steps`, 1].
+        min_n_noises: An int, min number of the noises per audio mixing
+        max_n_noises: An int, max number of the noises per audio mixing
+        min_n_speakers: An int, min number of the speakers per audio mixing
+        max_n_speakers: An int, max number of the speakers per audio mixing
         min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
         max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
-        min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
-        max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
-        limit_audio_peak_dbfs: A float, specifying the limitation of maximun audio dbfs of chunks, the audio volume will not gain over than the specified value.
-        limit_noise_peak_dbfs: A float, specifying the limitation of maximun noise dbfs of chunks, the noise volume will not gain over than the specified value.
+        min_noise_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining noise.
+        max_noise_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining noise.
+        min_speech_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining speech.
+        max_speech_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining speech.
+        limit_audio_peak_dbfs: A float, specifying the limit on the maximum `audio` dbfs of chunks; the audio volume will not be gained above the specified value.
+        limit_noise_peak_dbfs: A float, specifying the limit on the maximum `noise` dbfs of chunks; the noise volume will not be gained above the specified value.
+        limit_speech_peak_dbfs: A float, specifying the limit on the maximum `speech` dbfs of chunks; the speech volume will not be gained above the specified value.
         sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement.
 
     Returns:
         A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
     """
 
     audio_len = tf.shape(audio)[0]
-    noise_len = tf.shape(noise)[0]
-
     audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    mixed_audio = tf.multiply(audio, audio_gain_ratio)
+
 
+    if noise_iterator:
+        n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
+        def mix_noise_func(au):
+            noise = noise_iterator.get_next()
+            noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)
+
+    if speech_iterator:
+        n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
+        def mix_speech_func(au):
+            speech = speech_iterator.get_next()
+            speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
+def extract_noise(noise, audio_len, sample_rate=16000):
+    r"""Prepare a noise segment of length `audio_len` that can be mixed with the audio
+
+    Args:
+        noise: A 2-D Tensor with shape [`time-steps`, 1]
+        audio_len: A tf.int32 scalar, the audio length
+        sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement.
+    Returns:
+        A 2-D Tensor with shape [`audio_len`, 1].
+        A float, the extracted noise mean dbfs
+        A float, the extracted noise max dbfs
+    """
+    noise_len = tf.shape(noise)[0]
     multiply = tf.math.floordiv(audio_len, noise_len) + 1
     noise_tile = tf.tile(noise, [multiply, 1])
 
-
     # Now, noise_len must > audio_len
     noise_tile_len = tf.shape(noise_tile)[0]
 
     mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
     mix_decoded_end_point = mix_decoded_start_point + audio_len
-    extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
-
-    extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
-
-    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs
 
-    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
+    r"""The input audio length must be equal to the noise length
 
-    # limit audio peak
-    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
-    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
-
-    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """
 
     # target_snr_db := target_audio_dbfs - target_noise_dbfs
-    target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)
+    target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)
 
-    target_noise_dbfs = target_audio_dbfs - target_snr_db
-    noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
+    target_noise_dbfs = audio_dbfs - target_snr_db
+    noise_gain_db = target_noise_dbfs - noise_mean_dbfs
 
     # limit noise peak
-    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
     noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)
 
-    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)
-
-    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
-
-    return mixed_audio
+    audio += tf.multiply(noise, noise_gain_ratio)
+    return audio
 
-def gla(spectrogram):
-    r"""Use Griffin-Lim algorithm to reconstruct audio and fix iteration=10 to not waste too much performance in prefetch
+def gla(spectrogram, n_iter=10):
+    r"""Use Griffin-Lim algorithm to reconstruct audio
 
     Args:
         spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`].
@@ -206,7 +255,7 @@ def reconstruct_phases(prev_phases):
     rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32)
     phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands))
 
-    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10)
+    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter)
     xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases
     audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
     return tf.transpose(audio)
diff --git a/util/feeding.py b/util/feeding.py
@@ -68,28 +68,35 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
                 FLAGS.augmentation_freq_and_time_masking,
                 FLAGS.augmentation_pitch_and_tempo_scaling,
                 FLAGS.augmentation_speed_up_std > 0]):
-        review_audio = gla(spectrogram)
+        review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations)
 
     return mfccs, tf.shape(input=mfccs)[0], review_audio
 
 
-def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
+def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None):
     samples = tf.io.read_file(wav_filename)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     audio = decoded.audio
 
     # augment audio
-    if noise_iterator:
-        noise = noise_iterator.get_next()
+    if noise_iterator or speech_iterator:
         audio = augment_noise(
             audio,
-            noise,
+            noise_iterator,
+            speech_iterator,
+            min_n_noises=FLAGS.audio_aug_min_n_noises,
+            max_n_noises=FLAGS.audio_aug_max_n_noises,
+            min_n_speakers=FLAGS.audio_aug_min_n_speakers,
+            max_n_speakers=FLAGS.audio_aug_max_n_speakers,
             min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs,
             max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs,
-            min_snr_db=FLAGS.audio_aug_min_snr_db,
-            max_snr_db=FLAGS.audio_aug_max_snr_db,
+            min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db,
+            max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db,
+            min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db,
+            max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db,
             limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs,
             limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs,
+            limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs,
             sample_rate=FLAGS.audio_sample_rate,
         )
 
@@ -106,9 +113,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
     return features, features_len, review_audio
 
 
-def entry_to_features(wav_filename, transcript, train_phase, noise_iterator):
+def entry_to_features(wav_filename, transcript, train_phase, noise_iterator, speech_iterator):
     # https://bugs.python.org/issue32117
-    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator)
+    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
     return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio
 
 
@@ -121,7 +128,7 @@ def to_sparse_tuple(sequence):
     return indices, sequence, shape
 
 
-def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None):
+def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
     df = read_csvs(csvs)
     df.sort_values(by='wav_filesize', inplace=True)
 
@@ -156,12 +163,11 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios):
 
     num_gpus = len(Config.available_devices)
 
-    if noise_sources:
-        noise_iterator = create_noise_iterator(noise_sources, read_csvs)
-    else:
-        noise_iterator = None
+    noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
+    speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None
 
-    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator)
+
+    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
 
     dataset = tf.data.Dataset.from_generator(generate_values,
                                              output_types=(tf.string, (tf.int64, tf.int32, tf.int64)))