diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py new file mode 100644 index 0000000000..2a15fad562 --- /dev/null +++ b/bin/normalize_noise_audio.py @@ -0,0 +1,170 @@ +from __future__ import absolute_import, division, print_function + +# Make sure we can import stuff from util/ +# This script needs to be run from the root of the DeepSpeech repository + +from librosa import get_duration +from multiprocessing import Pool +from functools import partial +import math +import argparse +import sys +import os +import progressbar +sys.path.insert(1, os.path.join(sys.path[0], '..')) + +from util.feeding import secs_to_hours + +try: + from pydub import AudioSegment +except ImportError as err: + print('[ImportError] try `sudo apt-get install ffmpeg && pip install pydub`') + raise err + + +def detect_silence(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim = 0 # ms + sound_size = len(sound) + assert chunk_size > 0 # to avoid infinite loop + while sound[start_trim:(start_trim + chunk_size)].dBFS < silence_threshold and start_trim < sound_size: + start_trim += chunk_size + + end_trim = sound_size + while sound[(end_trim - chunk_size):end_trim].dBFS < silence_threshold and end_trim > 0: + end_trim -= chunk_size + + start_trim = min(sound_size, start_trim) + end_trim = max(0, end_trim) + + return min([start_trim, end_trim]), max([start_trim, end_trim]) + + +def trim_silence_audio(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim, end_trim = detect_silence(sound, silence_threshold, chunk_size) + return sound[start_trim:end_trim] + + +def convert(filename, dst_dirpath, dirpath, normalize, trim_silence, + min_duration_seconds, max_duration_seconds): + if not filename.endswith(('.wav', '.raw')): + return + + filepath = os.path.join(dirpath, filename) + if filename.endswith('.wav'): + sound: AudioSegment = AudioSegment.from_file(filepath) + else: + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=44100, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Retrying conversion: {}'.format(err)) + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=48000, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Skipping file {}, got error: {}'.format(filepath, err)) + return + try: + sound = sound.set_frame_rate(16000) + except Exception as err: # pylint: disable=broad-except + print('Skipping {}'.format(err)) + return + + n_splits = max(1, math.ceil(sound.duration_seconds / max_duration_seconds)) + chunk_duration_ms = math.ceil(len(sound) / n_splits) + chunks = [] + + for i in range(n_splits): + end_ms = min((i + 1) * chunk_duration_ms, len(sound)) + chunk = sound[(i * chunk_duration_ms):end_ms] + chunks.append(chunk) + + for i, chunk in enumerate(chunks): + dst_path = os.path.join(dst_dirpath, str(i) + '_' + filename) + if dst_path.endswith('.raw'): + dst_path = dst_path[:-4] + '.wav' + + if os.path.exists(dst_path): + print('Audio already exists: {}'.format(dst_path)) + return + + if normalize: + chunk = chunk.normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if trim_silence: + chunk = trim_silence_audio(chunk) + + if chunk.duration_seconds < min_duration_seconds: + return + chunk.export(dst_path, format='wav') + + +def get_noise_duration(dst_dir): + duration = 0.0 + file_num = 0 + for 
dirpath, _, filenames in os.walk(dst_dir): + for f in filenames: + if not f.endswith('.wav'): + continue + duration += get_duration(filename=os.path.join(dirpath, f)) + file_num += 1 + return duration, file_num + + +def main(src_dir, + dst_dir, + min_duration_seconds, + max_duration_seconds, + normalize=True, + trim_silence=True): + assert os.path.exists(src_dir) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir, exist_ok=False) + src_dir = os.path.abspath(src_dir) + dst_dir = os.path.abspath(dst_dir) + + for dirpath, _, filenames in os.walk(src_dir): + dirpath = os.path.abspath(dirpath) + dst_dirpath = os.path.join( + dst_dir, dirpath.replace(src_dir, '').lstrip('/')) + + print('Converting directory: {} -> {}'.format(dirpath, dst_dirpath)) + if not os.path.exists(dst_dirpath): + os.makedirs(dst_dirpath, exist_ok=False) + + convert_func = partial(convert, + dst_dirpath=dst_dirpath, + dirpath=dirpath, + normalize=normalize, + trim_silence=trim_silence, + min_duration_seconds=min_duration_seconds, + max_duration_seconds=max_duration_seconds) + + pool = Pool(processes=None) + pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start() + for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)): + pbar.update(i) + pbar.finish() + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description='Optimize noise files') + PARSER.add_argument('--from_dir', help='Convert wav from directory', type=str) + PARSER.add_argument('--to_dir', help='save wav to directory', type=str) + PARSER.add_argument('--min_sec', help='min duration seconds of saved file', type=float, default=1.0) + PARSER.add_argument('--max_sec', help='max duration seconds of saved file', type=float, default=30.0) + PARSER.add_argument('--normalize', action='store_true', help='Normalize sound range, default is true', default=True) + PARSER.add_argument('--trim', action='store_true', help='Trim silence, default is true', default=True) + PARAMS = PARSER.parse_args() + + main(PARAMS.from_dir, PARAMS.to_dir, PARAMS.min_sec, PARAMS.max_sec, PARAMS.normalize, PARAMS.trim) + + DURATION, FILE_NUM = get_noise_duration(PARAMS.to_dir) + print("Your noise dataset has {} files and a duration of {}\n".format(FILE_NUM, secs_to_hours(DURATION))) diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py index 5877b618ad..10043213b4 100755 --- a/training/deepspeech_training/evaluate.py +++ b/training/deepspeech_training/evaluate.py @@ -43,7 +43,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model): +def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None): if FLAGS.scorer_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet) @@ -51,13 +51,13 @@ def evaluate(test_csvs, create_model): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets] - batch_wav_filename, (batch_x, 
batch_x_len), batch_y = iterator.get_next() + batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next() # One rate per layer no_dropout = [None] * 6 diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py index eed8cd9eb2..dc48cef3f7 100644 --- a/training/deepspeech_training/train.py +++ b/training/deepspeech_training/train.py @@ -228,7 +228,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): the decoded result and the batch's original Y. ''' # Obtain the next batch of data - batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next() + batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next() if FLAGS.train_cudnn: rnn_impl = rnn_impl_cudnn_rnn @@ -239,7 +239,9 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl) # Compute the CTC loss using TensorFlow's `ctc_loss` - total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len) + total_loss = tfv1.nn.ctc_loss(labels=batch_y, + inputs=logits, + sequence_length=batch_seq_len) # Check if any files lead to non finite loss non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss))) @@ -248,7 +250,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): avg_loss = tf.reduce_mean(input_tensor=total_loss) # Finally we return the average loss - return avg_loss, non_finite_files + return avg_loss, non_finite_files, review_audio # Adam Optimization @@ -309,7 +311,7 @@ def get_tower_results(iterator, optimizer, dropout_rates): with tf.name_scope('tower_%d' % i): # Calculate the avg_loss and mean_edit_distance and retrieve the decoded # batch along with the original batch's labels (Y) of this tower - avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) + avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) # Allow for variables to be re-used by the next tower tfv1.get_variable_scope().reuse_variables() @@ -326,6 +328,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) + if FLAGS.review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -415,7 +419,9 @@ def train(): FLAGS.augmentation_freq_and_time_masking or FLAGS.augmentation_pitch_and_tempo_scaling or FLAGS.augmentation_speed_up_std > 0 or - FLAGS.augmentation_sparse_warp): + FLAGS.augmentation_sparse_warp or + FLAGS.train_augmentation_noise_files or + FLAGS.train_augmentation_speech_files): do_cache_dataset = False exception_box = ExceptionBox() @@ -428,7 +434,9 @@ def train(): train_phase=True, exception_box=exception_box, process_ahead=len(Config.available_devices) * FLAGS.train_batch_size * 2, - buffering=FLAGS.read_buffer) + buffering=FLAGS.read_buffer, + noise_sources=FLAGS.train_augmentation_noise_files, + speech_sources=FLAGS.train_augmentation_speech_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), 
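The audio summary just above is registered in its own 'step_audio_summaries' collection so it can be merged and fetched independently of the scalar step summaries. Below is a minimal, self-contained sketch of that collection pattern (not part of this patch; the placeholder tensor, names and sample rate are illustrative only):

import tensorflow.compat.v1 as tfv1

# Under TF 2.x the v1 summary ops need graph mode; guarded so it also runs on TF 1.x.
if hasattr(tfv1, 'disable_eager_execution'):
    tfv1.disable_eager_execution()

# Audio summaries expect a float tensor shaped [batch, frames, channels] (or [batch, frames]).
review_audio = tfv1.placeholder(tfv1.float32, [1, None, 1], name='review_audio')  # illustrative
tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000,
                   collections=['step_audio_summaries'])
tfv1.summary.scalar(name='step_loss', tensor=tfv1.constant(0.0), collections=['step_summaries'])

# Merging per collection lets the training loop run the (large) audio summary only for
# the first few steps while still writing the scalar summary on every step.
step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries')
step_summaries_op = tfv1.summary.merge_all('step_summaries')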
@@ -444,7 +452,9 @@ def train(): train_phase=False, exception_box=exception_box, process_ahead=len(Config.available_devices) * FLAGS.dev_batch_size * 2, - buffering=FLAGS.read_buffer) for source in dev_sources] + buffering=FLAGS.read_buffer, + noise_sources=FLAGS.dev_augmentation_noise_files, + speech_sources=FLAGS.dev_augmentation_speech_files) for source in dev_sources] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -482,6 +492,7 @@ def train(): apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries + step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries') step_summaries_op = tfv1.summary.merge_all('step_summaries') step_summary_writers = { 'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), @@ -541,11 +552,21 @@ def __call__(self, progress, data, **kwargs): session.run(init_op) # Batch loop + + audio_summary_steps = 0 while True: try: - _, current_step, batch_loss, problem_files, step_summary = \ - session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], - feed_dict=feed_dict) + step_audio_summary = None + if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0: + _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], + feed_dict=feed_dict) + audio_summary_steps += 1 + else: + _, current_step, batch_loss, problem_files, step_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], + feed_dict=feed_dict) + exception_box.raise_if_set() except tf.errors.InvalidArgumentError as err: if FLAGS.augmentation_sparse_warp: @@ -566,6 +587,9 @@ def __call__(self, progress, data, **kwargs): pbar.update(step_count) + if step_audio_summary is not None: + step_summary_writer.add_summary(step_audio_summary, current_step) + step_summary_writer.add_summary(step_summary, current_step) if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs: @@ -639,7 +663,7 @@ def __call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model) + samples = evaluate(FLAGS.test_files.split(','), create_model, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -651,7 +675,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False): # Create feature computation graph input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples') samples = tf.expand_dims(input_samples, -1) - mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) + mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) mfccs = tf.identity(mfccs, name='mfccs') # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] @@ -851,7 +875,7 @@ def do_single_file_inference(input_file_path): # Restore variables from training checkpoint load_graph_for_evaluation(session) - features, features_len = audiofile_to_features(input_file_path) + features, features_len, _ = audiofile_to_features(input_file_path) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git 
a/training/deepspeech_training/util/audio_augmentation.py b/training/deepspeech_training/util/audio_augmentation.py
new file mode 100644
index 0000000000..55b8957178
--- /dev/null
+++ b/training/deepspeech_training/util/audio_augmentation.py
@@ -0,0 +1,261 @@
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+import tensorflow.compat.v1 as tfv1
+import numpy as np
+from tensorflow.python.ops import gen_audio_ops as contrib_audio
+import os
+from .logging import log_info
+from .config import Config
+
+
+DBFS_COEF = 10.0 / np.log(10.0)
+
+def filename_to_audio(wav_filename):
+    r"""Decode `wav_filename` and return the audio
+
+    Args:
+        wav_filename: A str, the path of the wav file
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1].
+    """
+    samples = tf.io.read_file(wav_filename)
+    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
+    return decoded.audio
+
+def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce_mean):
+    r"""Measure the dBFS of `audio` chunk by chunk, then return the statistics computed by `reduce_funcs`
+
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+        chunk_ms: An integer in milliseconds, the size of each separately measured chunk, default is `100`.
+        reduce_funcs: A function or a list of functions applied to the per-chunk values, default is `tf.reduce_mean`.
+
+    Returns:
+        A float or a list of floats, depending on whether `reduce_funcs` is a single function or a list.
+    """
+    assert chunk_ms % 10 == 0, 'chunk_ms must be a multiple of 10'
+
+    audio_len = tf.shape(audio)[0]
+    chunk_len = tf.math.floordiv(sample_rate, tf.math.floordiv(1000, chunk_ms))  # default: 1600
+    n_chunks = tf.math.floordiv(audio_len, chunk_len)
+    trim_audio_len = tf.multiply(n_chunks, chunk_len)
+    audio = audio[:trim_audio_len]
+    splits = tf.reshape(audio, shape=[n_chunks, -1])
+
+    squares = tf.square(splits)
+    means = tf.reduce_mean(squares, axis=1)
+
+    # the statistics functions must execute before tf.log(), otherwise the gain dB would be wrong
+    if not isinstance(reduce_funcs, list):
+        reduces = reduce_funcs(means)
+        return DBFS_COEF * tf.math.log(reduces + 1e-8)
+
+    reduces = [reduce_func(means) for reduce_func in reduce_funcs]
+    return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces]
+
+
+def create_noise_iterator(noise_sources, read_csvs_func):
+    r"""Create an iterator that yields noise audio
+
+    Args:
+        noise_sources: A comma-separated str or a list/tuple of str, the directories or CSV files to collect wav filenames from.
+        read_csvs_func: A function, the `read_csvs()` function from `util/feeding.py`, passed in to avoid a circular import.
+
+    Returns:
+        A one-shot iterator of audio, each item a 2-D Tensor of shape [`time-steps`, 1]; use `.get_next()` to get the Tensor.
+    """
+    if isinstance(noise_sources, str):
+        noise_sources = noise_sources.split(',')
+
+    noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_sources, read_csvs_func)), dtype=tf.string)
+    log_info("Collected {} noise files for mixing audio".format(noise_filenames.shape[0]))
+
+    noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames)
+                     .shuffle(min(noise_filenames.shape[0], 102400))
+                     .map(filename_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+                     .prefetch(tfv1.data.experimental.AUTOTUNE)
+                     .repeat())
+    noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset)
+    return noise_iterator
+
+
+def collect_noise_filenames(sources, read_csvs_func):
+    r"""Collect wav filenames from directories or CSV files
+
+    Args:
+        sources: A list/tuple of str, the directories or CSV files to collect wav filenames from.
+        read_csvs_func: A function, the `read_csvs()` function from `util/feeding.py`, passed in to avoid a circular import.
+
+    Returns:
+        A generator of str, yielding every filename that ends with `.wav` or appears in the `wav_filename` column of a CSV.
+    """
+
+    assert isinstance(sources, (list, tuple))
+
+    for source in sources:
+        assert os.path.exists(source)
+        if os.path.isdir(source):
+            for dirpath, _, filenames in os.walk(source):
+                for filename in filenames:
+                    if filename.endswith('.wav'):
+                        yield os.path.join(dirpath, filename)
+        elif os.path.isfile(source):
+            df = read_csvs_func([source])
+            for filename in df['wav_filename']:
+                yield filename
+
+
+def augment_noise(audio,
+                  noise_iterator=None,
+                  speech_iterator=None,
+                  min_n_noises=0,
+                  max_n_noises=1,
+                  min_n_speakers=0,
+                  max_n_speakers=1,
+                  min_audio_dbfs=-35.0,
+                  max_audio_dbfs=0.0,
+                  min_noise_snr_db=3.0,
+                  max_noise_snr_db=30.0,
+                  min_speech_snr_db=3.0,
+                  max_speech_snr_db=30.0,
+                  limit_audio_peak_dbfs=7.0,
+                  limit_noise_peak_dbfs=3.0,
+                  limit_speech_peak_dbfs=7.0,
+                  sample_rate=16000):
+    r"""Mix an audio Tensor with noise Tensors
+
+    If the noise is shorter than the audio, it is automatically repeated until it is longer than the audio.
+    A window of the (repeated) noise is then chosen at random so that it completely covers the audio,
+    i.e. the chosen noise window and the audio have equal shapes.
+
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        noise_iterator: A one-shot iterator over noise files, each yielded item has shape [`time-steps`, 1].
+        speech_iterator: A one-shot iterator over speech files, each yielded item has shape [`time-steps`, 1].
+        min_n_noises: An int, the minimum number of noises mixed into each audio.
+        max_n_noises: An int, the maximum number of noises mixed into each audio.
+        min_n_speakers: An int, the minimum number of speakers mixed into each audio.
+        max_n_speakers: An int, the maximum number of speakers mixed into each audio.
+        min_audio_dbfs: A float in dBFS, the minimum target volume of the audio when gaining the audio.
+        max_audio_dbfs: A float in dBFS, the maximum target volume of the audio when gaining the audio.
+        min_noise_snr_db: A float in dB, the minimum signal-to-noise ratio used when gaining the noise.
+        max_noise_snr_db: A float in dB, the maximum signal-to-noise ratio used when gaining the noise.
+        min_speech_snr_db: A float in dB, the minimum signal-to-noise ratio used when gaining the speech.
+        max_speech_snr_db: A float in dB, the maximum signal-to-noise ratio used when gaining the speech.
+        limit_audio_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `audio`; the audio will not be gained above this value.
+        limit_noise_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `noise`; the noise will not be gained above this value.
+        limit_speech_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `speech`; the speech will not be gained above this value.
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """
+
+    audio_len = tf.shape(audio)[0]
+    audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    mixed_audio = tf.multiply(audio, audio_gain_ratio)
+
+
+    if noise_iterator:
+        n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
+        def mix_noise_func(au):
+            noise = noise_iterator.get_next()
+            noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)
+
+    if speech_iterator:
+        n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
+        def mix_speech_func(au):
+            speech = speech_iterator.get_next()
+            speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
+def extract_noise(noise, audio_len, sample_rate=16000):
+    r"""Prepare a noise segment that can be mixed with the audio
+
+    Args:
+        noise: A 2-D Tensor with shape [`time-steps`, 1]
+        audio_len: A tf.int32 scalar, the audio length
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+    Returns:
+        A 2-D Tensor with shape [`audio_len`, 1].
+        A float, the mean dBFS of the extracted noise
+        A float, the max dBFS of the extracted noise
+    """
+    noise_len = tf.shape(noise)[0]
+    multiply = tf.math.floordiv(audio_len, noise_len) + 1
+    noise_tile = tf.tile(noise, [multiply, 1])
+
+    # after tiling, the noise is guaranteed to be longer than the audio
+    noise_tile_len = tf.shape(noise_tile)[0]
+
+    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
+    mix_decoded_end_point = mix_decoded_start_point + audio_len
+    extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs
+
+def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
+    r"""Mix `noise` into `audio`; the input audio length must equal the noise length
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """
+
+    # target_snr_db := target_audio_dbfs - target_noise_dbfs
+    target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)
+
+    target_noise_dbfs = audio_dbfs - target_snr_db
+    noise_gain_db = target_noise_dbfs - noise_mean_dbfs
+
+    # limit noise peak
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
+    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)
+
+    audio += tf.multiply(noise, noise_gain_ratio)
+    return audio
+
+def gla(spectrogram, n_iter=10):
+    r"""Reconstruct audio from a spectrogram with the Griffin-Lim algorithm
+
+    Args:
+        spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`].
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1], the audio reconstructed from the spectrogram.
+ """ + frame_length = int(Config.audio_window_samples) + frame_step = int(Config.audio_step_samples) + fft_length = 512 + spectrogram = tf.reshape(spectrogram, shape=[1, -1, 257]) + abs_spectrogram = tf.abs(spectrogram) + + def reconstruct_phases(prev_phases): + xi = tf.complex(abs_spectrogram, 0.0) * prev_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_xi = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_phases = tf.math.exp(tf.complex(0.0, tf.angle(next_xi))) + return next_phases + + rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32) + phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands)) + + reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter) + xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + return tf.transpose(audio) diff --git a/training/deepspeech_training/util/feeding.py b/training/deepspeech_training/util/feeding.py index 79c2da75d0..2d272c6c78 100644 --- a/training/deepspeech_training/util/feeding.py +++ b/training/deepspeech_training/util/feeding.py @@ -2,8 +2,10 @@ from __future__ import absolute_import, division, print_function from functools import partial +import os import numpy as np +import pandas import tensorflow as tf from tensorflow.python.ops import gen_audio_ops as contrib_audio @@ -15,6 +17,19 @@ from .audio import change_audio_types, read_frames_from_file, vad_split, pcm_to_np, DEFAULT_FORMAT, AUDIO_TYPE_NP from .sample_collections import samples_from_files from .helpers import remember_exception, MEGABYTE +from .audio_augmentation import augment_noise, create_noise_iterator, gla + + +def read_csvs(csv_files): + sets = [] + for csv in csv_files: + file = pandas.read_csv(csv, encoding='utf-8', na_filter=False) + #FIXME: not cross-platform + csv_dir = os.path.dirname(os.path.abspath(csv)) + file['wav_filename'] = file['wav_filename'].str.replace(r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1))) # pylint: disable=cell-var-from-loop + sets.append(file) + # Concat all sets, drop any extra columns, re-index the final result as 0..N + return pandas.concat(sets, join='inner', ignore_index=True) def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): @@ -69,12 +84,43 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): upper_frequency_limit=FLAGS.audio_sample_rate/2) mfccs = tf.reshape(mfccs, [-1, Config.n_input]) - return mfccs, tf.shape(input=mfccs)[0] - - -def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None): - features, features_len = samples_to_mfccs(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) - + review_audio = samples + if FLAGS.review_audio_steps and train_phase and any([ + FLAGS.augmentation_spec_dropout_keeprate < 1, + FLAGS.augmentation_freq_and_time_masking, + FLAGS.augmentation_pitch_and_tempo_scaling, + FLAGS.augmentation_speed_up_std > 0]): + review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations) + + return mfccs, tf.shape(input=mfccs)[0], review_audio + +def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None, noise_iterator=None, speech_iterator=None): + + # augment audio + if noise_iterator or speech_iterator: + audio = augment_noise( + audio, + noise_iterator, + speech_iterator, + 
min_n_noises=FLAGS.audio_aug_min_n_noises, + max_n_noises=FLAGS.audio_aug_max_n_noises, + min_n_speakers=FLAGS.audio_aug_min_n_speakers, + max_n_speakers=FLAGS.audio_aug_max_n_speakers, + min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs, + min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db, + max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db, + min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db, + max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs, + limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs, + sample_rate=FLAGS.audio_sample_rate, + ) + + features, features_len, review_audio = samples_to_mfccs(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) + + # augment features if train_phase: if FLAGS.data_aug_features_multiplicative > 0: features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features)) @@ -82,20 +128,20 @@ def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None): if FLAGS.data_aug_features_additive > 0: features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features)) - return features, features_len + return features, features_len, review_audio -def audiofile_to_features(wav_filename, train_phase=False): +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename) + return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename, noise_iterator=noise_iterator, speech_iterator=speech_iterator) -def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False): +def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False, noise_iterator=None, speech_iterator=None): # https://bugs.python.org/issue32117 - features, features_len = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) + features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id, noise_iterator=noise_iterator, speech_iterator=speech_iterator) sparse_transcript = tf.SparseTensor(*transcript) - return sample_id, features, features_len, sparse_transcript + return sample_id, features, features_len, sparse_transcript, review_audio def to_sparse_tuple(sequence): @@ -114,7 +160,9 @@ def create_dataset(sources, train_phase=False, exception_box=None, process_ahead=None, - buffering=1 * MEGABYTE): + buffering=1 * MEGABYTE, + noise_sources=None, + speech_sources=None): def generate_values(): samples = samples_from_files(sources, buffering=buffering, labeled=True) for sample in change_audio_types(samples, @@ -131,14 +179,25 @@ def sparse_reshape(sparse): shape = sparse.dense_shape return tf.sparse.reshape(sparse, [shape[0], shape[2]]) - def batch_fn(sample_ids, features, features_len, transcripts): + def batch_fn(sample_ids, features, features_len, transcripts, review_audios): features = tf.data.Dataset.zip((features, features_len)) features = features.padded_batch(batch_size, padded_shapes=([None, Config.n_input], [])) transcripts = transcripts.batch(batch_size).map(sparse_reshape) 
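For reference, the gain arithmetic that augment_noise() and mix() implement with TensorFlow ops reduces to a few dBFS/SNR formulas; the NumPy sketch below (not part of the patch, peak limiting omitted, helper name illustrative) only checks that arithmetic numerically:

import numpy as np

def mean_dbfs(x, sample_rate=16000, chunk_ms=100):
    # Mirror of audio_to_dbfs(): mean power per chunk, reduced first, then
    # converted with 10 / ln(10) * ln(power), i.e. 10 * log10(power).
    chunk = sample_rate // (1000 // chunk_ms)            # 1600 samples per chunk
    x = x[:len(x) // chunk * chunk].reshape(-1, chunk)
    return 10.0 / np.log(10.0) * np.log(np.mean(np.mean(x ** 2, axis=1)) + 1e-8)

audio = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0)   # 1 s test tone
noise = 0.05 * np.random.randn(16000)

target_snr_db = 10.0                                     # drawn uniformly in mix()
noise_gain_db = (mean_dbfs(audio) - target_snr_db) - mean_dbfs(noise)
scaled_noise = noise * 10.0 ** (noise_gain_db / 20.0)
mixed = audio + scaled_noise                             # same final step as mix()

# The realised SNR matches the requested 10 dB (up to the 1e-8 floor).
print(mean_dbfs(audio) - mean_dbfs(scaled_noise))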
        sample_ids = sample_ids.batch(batch_size)
-        return tf.data.Dataset.zip((sample_ids, features, transcripts))
-    process_fn = partial(entry_to_features, train_phase=train_phase)
+        # To avoid wasting prefetch throughput, randomly keep only one audio per batch for review
+        if FLAGS.review_audio_steps and batch_size > 1:
+            skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64)
+            review_audio = review_audios.skip(skip_size).batch(1)
+        else:
+            review_audio = review_audios.batch(1)
+
+        return tf.data.Dataset.zip((sample_ids, features, transcripts, review_audio))
+
+    noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
+    speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None
+
+    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
     dataset = (tf.data.Dataset.from_generator(remember_exception(generate_values, exception_box),
                                               output_types=(tf.string, tf.float32, tf.int32,
@@ -167,7 +226,7 @@ def generate_values():
            yield time_start, time_end, samples
    def to_mfccs(time_start, time_end, samples):
-        features, features_len = samples_to_mfccs(samples, audio_format[0])
+        features, features_len, _ = samples_to_mfccs(samples, audio_format[0])
        return time_start, time_end, features, features_len
    def create_batch_set(bs, criteria):
@@ -185,3 +244,9 @@ def create_batch_set(bs, criteria):
    dataset = nds.concatenate(ods)
    dataset = dataset.prefetch(len(Config.available_devices))
    return dataset
+
+
+def secs_to_hours(secs):
+    hours, remainder = divmod(secs, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return '%d:%02d:%02d' % (hours, minutes, seconds)
diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py
index 63bc7bb4ed..9d415c7a88 100644
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@@ -25,6 +25,27 @@ def create_flags():
    # Data Augmentation
    # ================
+    f.DEFINE_string('train_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the train set; if empty, no mixing is done in the train phase')
+    f.DEFINE_string('dev_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the dev set; if empty, no mixing is done in the dev phase')
+    f.DEFINE_string('test_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the test set; if empty, no mixing is done in the test phase')
+    f.DEFINE_string('train_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the train set; if empty, no mixing is done in the train phase')
+    f.DEFINE_string('dev_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the dev set; if empty, no mixing is done in the dev phase')
+    f.DEFINE_string('test_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the test set; if empty, no mixing is done in the test phase')
+    f.DEFINE_float('audio_aug_min_audio_dbfs', -35, 'minimum target volume of the audio in dBFS when gaining the audio')
+    f.DEFINE_float('audio_aug_max_audio_dbfs', 0, 'maximum target volume of the audio in dBFS when gaining the audio')
+    f.DEFINE_float('audio_aug_min_noise_snr_db', 3, 'minimum signal-to-noise ratio in dB used when gaining the noise')
+    f.DEFINE_float('audio_aug_max_noise_snr_db', 30, 'maximum signal-to-noise ratio in dB used when gaining the noise')
+    f.DEFINE_float('audio_aug_min_speech_snr_db', 10, 'minimum signal-to-noise ratio in dB used when gaining the speech')
+    f.DEFINE_float('audio_aug_max_speech_snr_db', 50, 'maximum signal-to-noise ratio in dB used when gaining the speech')
+    f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'upper limit in dBFS on the peak chunk dBFS of the audio; the audio will not be gained above this value')
+    f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'upper limit in dBFS on the peak chunk dBFS of the noise; the noise will not be gained above this value')
+    f.DEFINE_float('audio_aug_limit_speech_peak_dbfs', 3.0, 'upper limit in dBFS on the peak chunk dBFS of the speech; the speech will not be gained above this value')
+    f.DEFINE_integer('audio_aug_min_n_noises', 0, 'minimum number of noises mixed into each audio')
+    f.DEFINE_integer('audio_aug_max_n_noises', 1, 'maximum number of noises mixed into each audio')
+    f.DEFINE_integer('audio_aug_min_n_speakers', 0, 'minimum number of speakers mixed into each audio')
+    f.DEFINE_integer('audio_aug_max_n_speakers', 1, 'maximum number of speakers mixed into each audio')
+
+
    f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
    f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
@@ -50,6 +71,9 @@ def create_flags():
    f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling')
    f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling')
+    f.DEFINE_integer('review_audio_steps', 0, 'number of augmented audio samples to push to the summary directory (if 0, no audio is dumped); one sample per step is saved until the given count is reached')
+    f.DEFINE_integer('review_audio_gla_iterations', 10, 'number of Griffin-Lim iterations used to reconstruct audio from features')
+
    # Global Constants
    # ================
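Taken together, a typical workflow is to prepare a noise set with the new bin/normalize_noise_audio.py script and then point the new flags at the result. A minimal sketch, assuming the usual DeepSpeech.py entry point at the repository root and placeholder data paths:

import subprocess

# 1) Normalize and segment raw noise recordings into 16 kHz wav chunks (paths are illustrative).
subprocess.run(['python', 'bin/normalize_noise_audio.py',
                '--from_dir', 'data/noise_raw',
                '--to_dir', 'data/noise_prepared',
                '--min_sec', '1.0', '--max_sec', '30.0'], check=True)

# 2) Train with noise mixed into the train and dev sets, and dump a few augmented
#    samples to TensorBoard via the new review_audio_steps flag.
subprocess.run(['python', 'DeepSpeech.py',
                '--train_files', 'data/train.csv',
                '--dev_files', 'data/dev.csv',
                '--train_augmentation_noise_files', 'data/noise_prepared',
                '--dev_augmentation_noise_files', 'data/noise_prepared',
                '--audio_aug_min_noise_snr_db', '3',
                '--audio_aug_max_noise_snr_db', '30',
                '--review_audio_steps', '5'], check=True)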