Skip to content

Commit

Permalink
[FIX] replace tqdm with progressbar [ADD] separate speech/noise mixin…
Browse files Browse the repository at this point in the history
…g, add option to mix multi noise into one audio [MOD] change FLAGS name, gla iterations is optional
  • Loading branch information
mychiux413 committed Mar 31, 2020
1 parent 42bc45b commit f7d1279
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 73 deletions.
10 changes: 6 additions & 4 deletions DeepSpeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ def train():
FLAGS.augmentation_freq_and_time_masking or
FLAGS.augmentation_pitch_and_tempo_scaling or
FLAGS.augmentation_speed_up_std > 0 or
FLAGS.train_augmentation_files):
FLAGS.train_augmentation_noise_files or
FLAGS.train_augmentation_speech_files):
do_cache_dataset = False

# Create training and validation datasets
Expand All @@ -440,7 +441,8 @@ def train():
enable_cache=FLAGS.feature_cache and do_cache_dataset,
cache_path=FLAGS.feature_cache,
train_phase=True,
noise_sources=FLAGS.train_augmentation_files)
noise_sources=FLAGS.train_augmentation_noise_files,
speech_sources=FLAGS.train_augmentation_speech_files)

iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
tfv1.data.get_output_shapes(train_set),
Expand All @@ -451,7 +453,7 @@ def train():

if FLAGS.dev_files:
dev_csvs = FLAGS.dev_files.split(',')
dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs]
dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_noise_files, speech_sources=FLAGS.dev_augmentation_speech_files) for csv in dev_csvs]
dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

# Dropout
Expand Down Expand Up @@ -690,7 +692,7 @@ def __call__(self, progress, data, **kwargs):


def test():
samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files)
samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
if FLAGS.test_output_file:
# Save decoded tuples as JSON, converting NumPy floats to Python floats
json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
Expand Down
14 changes: 6 additions & 8 deletions bin/normalize_noise_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,17 @@
# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository

from util.feeding import secs_to_hours
from librosa import get_duration
from multiprocessing import Pool
from functools import partial
import math
import argparse
import sys
import os
import progressbar
sys.path.insert(1, os.path.join(sys.path[0], '..'))

try:
import tqdm
except ImportError as err:
print('[ImportError] try `pip install tqdm`')
raise err
from util.feeding import secs_to_hours

try:
from pydub import AudioSegment
Expand Down Expand Up @@ -152,8 +148,10 @@ def main(src_dir,
max_duration_seconds=max_duration_seconds)

pool = Pool(processes=None)
for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)):
pass
pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start()
for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)):
pbar.update(i)
pbar.finish()


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
return [alphabet.decode(res) for res in results]


def evaluate(test_csvs, create_model, try_loading, noise_sources=None):
def evaluate(test_csvs, create_model, try_loading, noise_sources=None, speech_sources=None):
if FLAGS.lm_binary_path:
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
FLAGS.lm_binary_path, FLAGS.lm_trie_path,
Expand All @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_sources=None):
scorer = None

test_csvs = FLAGS.test_files.split(',')
test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs]
test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs]
iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
tfv1.data.get_output_shapes(test_sets[0]),
output_classes=tfv1.data.get_output_classes(test_sets[0]))
Expand Down
121 changes: 85 additions & 36 deletions util/audio_augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,21 @@ def collect_noise_filenames(sources, read_csvs_func):


def augment_noise(audio,
noise,
min_audio_dbfs=0.0,
max_audio_dbfs=-35.0,
min_snr_db=3.0,
max_snr_db=30.0,
noise_iterator=None,
speech_iterator=None,
min_n_noises=0,
max_n_noises=1,
min_n_speakers=0,
max_n_speakers=1,
min_audio_dbfs=-35.0,
max_audio_dbfs=0.0,
min_noise_snr_db=3.0,
max_noise_snr_db=30.0,
min_speech_snr_db=3.0,
max_speech_snr_db=30.0,
limit_audio_peak_dbfs=7.0,
limit_noise_peak_dbfs=3.0,
limit_speech_peak_dbfs=7.0,
sample_rate=16000):
r"""Mix audio Tensor with noise Tensor
Expand All @@ -125,65 +133,106 @@ def augment_noise(audio,
Args:
audio: A 2-D Tensor with shape [`time-steps`, 1].
noise: A 2-D Tensor with shape [`time-steps`, 1].
noise_iterator: A one shot iterator for noise file, the yield item shape is [`time-steps`, 1].
speech_iterator: A one shot iterator for speech file, the yield item shape is [`time-steps`, 1].
min_n_noises: An int, the minimum number of noises mixed into each audio.
max_n_noises: An int, the maximum number of noises mixed into each audio.
min_n_speakers: An int, the minimum number of speakers mixed into each audio.
max_n_speakers: An int, the maximum number of speakers mixed into each audio.
min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
limit_audio_peak_dbfs: A float, specifying the limitation of maximun audio dbfs of chunks, the audio volume will not gain over than the specified value.
limit_noise_peak_dbfs: A float, specifying the limitation of maximun noise dbfs of chunks, the noise volume will not gain over than the specified value.
min_noise_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining noise.
max_noise_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining noise.
min_speech_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining speech.
max_speech_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining speech.
limit_audio_peak_dbfs: A float, specifying the limit on the maximum `audio` dbfs of chunks; the audio volume will not be gained above the specified value.
limit_noise_peak_dbfs: A float, specifying the limit on the maximum `noise` dbfs of chunks; the noise volume will not be gained above the specified value.
limit_speech_peak_dbfs: A float, specifying the limit on the maximum `speech` dbfs of chunks; the speech volume will not be gained above the specified value.
sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement.
Returns:
A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
"""

audio_len = tf.shape(audio)[0]
noise_len = tf.shape(noise)[0]

audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
audio_gain_db = target_audio_dbfs - audio_mean_dbfs

# limit audio peak
audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
target_audio_dbfs = audio_mean_dbfs + audio_gain_db
audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
mixed_audio = tf.multiply(audio, audio_gain_ratio)


if noise_iterator:
n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
def mix_noise_func(au):
noise = noise_iterator.get_next()
noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)

if speech_iterator:
n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
def mix_speech_func(au):
speech = speech_iterator.get_next()
speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)

mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)

return mixed_audio

def extract_noise(noise, audio_len, sample_rate=16000):
r"""Extract a randomly positioned segment of `noise` with the same length as the audio, ready for mixing
Args:
noise: A 2-D Tensor with shape [`time-steps`, 1]
audio_len: A tf.int32 scalar, the audio length
sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement.
Returns:
A 2-D Tensor with shape [`audio_len`, 1].
A float, the extracted noise mean dbfs
A float, the extracted noise max dbfs
"""
noise_len = tf.shape(noise)[0]
multiply = tf.math.floordiv(audio_len, noise_len) + 1
noise_tile = tf.tile(noise, [multiply, 1])


# After tiling, noise_tile_len is strictly greater than audio_len
noise_tile_len = tf.shape(noise_tile)[0]

mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
mix_decoded_end_point = mix_decoded_start_point + audio_len
extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]

extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])

target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs

audio_gain_db = target_audio_dbfs - audio_mean_dbfs
def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
r"""Mix `noise` into `audio`; the audio length must equal the noise length
# limit audio peak
audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
target_audio_dbfs = audio_mean_dbfs + audio_gain_db

audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
Returns:
A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
"""

# target_snr_db := target_audio_dbfs - target_noise_dbfs
target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)
target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)

target_noise_dbfs = target_audio_dbfs - target_snr_db
noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
target_noise_dbfs = audio_dbfs - target_snr_db
noise_gain_db = target_noise_dbfs - noise_mean_dbfs

# limit noise peak
noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)

mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)

mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)

return mixed_audio
audio += tf.multiply(noise, noise_gain_ratio)
return audio

def gla(spectrogram):
r"""Use Griffin-Lim algorithm to reconstruct audio and fix iteration=10 to not waste too much performance in prefetch
def gla(spectrogram, n_iter=10):
r"""Use Griffin-Lim algorithm to reconstruct audio
Args:
spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`].
Expand All @@ -206,7 +255,7 @@ def reconstruct_phases(prev_phases):
rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32)
phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands))

reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10)
reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter)
xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases
audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
return tf.transpose(audio)
36 changes: 21 additions & 15 deletions util/feeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,28 +68,35 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
FLAGS.augmentation_freq_and_time_masking,
FLAGS.augmentation_pitch_and_tempo_scaling,
FLAGS.augmentation_speed_up_std > 0]):
review_audio = gla(spectrogram)
review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations)

return mfccs, tf.shape(input=mfccs)[0], review_audio


def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None):
samples = tf.io.read_file(wav_filename)
decoded = contrib_audio.decode_wav(samples, desired_channels=1)
audio = decoded.audio

# augment audio
if noise_iterator:
noise = noise_iterator.get_next()
if noise_iterator or speech_iterator:
audio = augment_noise(
audio,
noise,
noise_iterator,
speech_iterator,
min_n_noises=FLAGS.audio_aug_min_n_noises,
max_n_noises=FLAGS.audio_aug_max_n_noises,
min_n_speakers=FLAGS.audio_aug_min_n_speakers,
max_n_speakers=FLAGS.audio_aug_max_n_speakers,
min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs,
max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs,
min_snr_db=FLAGS.audio_aug_min_snr_db,
max_snr_db=FLAGS.audio_aug_max_snr_db,
min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db,
max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db,
min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db,
max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db,
limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs,
limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs,
limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs,
sample_rate=FLAGS.audio_sample_rate,
)

Expand All @@ -106,9 +113,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
return features, features_len, review_audio


def entry_to_features(wav_filename, transcript, train_phase, noise_iterator):
def entry_to_features(wav_filename, transcript, train_phase, noise_iterator, speech_iterator):
# https://bugs.python.org/issue32117
features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator)
features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio


Expand All @@ -121,7 +128,7 @@ def to_sparse_tuple(sequence):
return indices, sequence, shape


def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None):
def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
df = read_csvs(csvs)
df.sort_values(by='wav_filesize', inplace=True)

Expand Down Expand Up @@ -156,12 +163,11 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios):

num_gpus = len(Config.available_devices)

if noise_sources:
noise_iterator = create_noise_iterator(noise_sources, read_csvs)
else:
noise_iterator = None
noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None

process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator)

process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)

dataset = tf.data.Dataset.from_generator(generate_values,
output_types=(tf.string, (tf.int64, tf.int32, tf.int64)))
Expand Down
Loading

0 comments on commit f7d1279

Please sign in to comment.