diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py new file mode 100644 index 0000000000..2a15fad562 --- /dev/null +++ b/bin/normalize_noise_audio.py @@ -0,0 +1,170 @@ +from __future__ import absolute_import, division, print_function + +# Make sure we can import stuff from util/ +# This script needs to be run from the root of the DeepSpeech repository + +from librosa import get_duration +from multiprocessing import Pool +from functools import partial +import math +import argparse +import sys +import os +import progressbar +sys.path.insert(1, os.path.join(sys.path[0], '..')) + +from util.feeding import secs_to_hours + +try: + from pydub import AudioSegment +except ImportError as err: + print('[ImportError] try `sudo apt-get install ffmpeg && pip install pydub`') + raise err + + +def detect_silence(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim = 0 # ms + sound_size = len(sound) + assert chunk_size > 0 # to avoid infinite loop + while sound[start_trim:(start_trim + chunk_size)].dBFS < silence_threshold and start_trim < sound_size: + start_trim += chunk_size + + end_trim = sound_size + while sound[(end_trim - chunk_size):end_trim].dBFS < silence_threshold and end_trim > 0: + end_trim -= chunk_size + + start_trim = min(sound_size, start_trim) + end_trim = max(0, end_trim) + + return min([start_trim, end_trim]), max([start_trim, end_trim]) + + +def trim_silence_audio(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim, end_trim = detect_silence(sound, silence_threshold, chunk_size) + return sound[start_trim:end_trim] + + +def convert(filename, dst_dirpath, dirpath, normalize, trim_silence, + min_duration_seconds, max_duration_seconds): + if not filename.endswith(('.wav', '.raw')): + return + + filepath = os.path.join(dirpath, filename) + if filename.endswith('.wav'): + sound: AudioSegment = AudioSegment.from_file(filepath) + else: + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=44100, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Retrying conversion: {}'.format(err)) + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=48000, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Skipping file {}, got error: {}'.format(filepath, err)) + return + try: + sound = sound.set_frame_rate(16000) + except Exception as err: # pylint: disable=broad-except + print('Skipping {}'.format(err)) + return + + n_splits = max(1, math.ceil(sound.duration_seconds / max_duration_seconds)) + chunk_duration_ms = math.ceil(len(sound) / n_splits) + chunks = [] + + for i in range(n_splits): + end_ms = min((i + 1) * chunk_duration_ms, len(sound)) + chunk = sound[(i * chunk_duration_ms):end_ms] + chunks.append(chunk) + + for i, chunk in enumerate(chunks): + dst_path = os.path.join(dst_dirpath, str(i) + '_' + filename) + if dst_path.endswith('.raw'): + dst_path = dst_path[:-4] + '.wav' + + if os.path.exists(dst_path): + print('Audio already exists: {}'.format(dst_path)) + return + + if normalize: + chunk = chunk.normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if trim_silence: + chunk = trim_silence_audio(chunk) + + if chunk.duration_seconds < min_duration_seconds: + return + chunk.export(dst_path, format='wav') + + +def get_noise_duration(dst_dir): + duration = 0.0 + file_num = 0 + for 
dirpath, _, filenames in os.walk(dst_dir): + for f in filenames: + if not f.endswith('.wav'): + continue + duration += get_duration(filename=os.path.join(dirpath, f)) + file_num += 1 + return duration, file_num + + +def main(src_dir, + dst_dir, + min_duration_seconds, + max_duration_seconds, + normalize=True, + trim_silence=True): + assert os.path.exists(src_dir) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir, exist_ok=False) + src_dir = os.path.abspath(src_dir) + dst_dir = os.path.abspath(dst_dir) + + for dirpath, _, filenames in os.walk(src_dir): + dirpath = os.path.abspath(dirpath) + dst_dirpath = os.path.join( + dst_dir, dirpath.replace(src_dir, '').lstrip('/')) + + print('Converting directory: {} -> {}'.format(dirpath, dst_dirpath)) + if not os.path.exists(dst_dirpath): + os.makedirs(dst_dirpath, exist_ok=False) + + convert_func = partial(convert, + dst_dirpath=dst_dirpath, + dirpath=dirpath, + normalize=normalize, + trim_silence=trim_silence, + min_duration_seconds=min_duration_seconds, + max_duration_seconds=max_duration_seconds) + + pool = Pool(processes=None) + pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start() + for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)): + pbar.update(i) + pbar.finish() + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description='Optimize noise files') + PARSER.add_argument('--from_dir', help='Convert wav from directory', type=str) + PARSER.add_argument('--to_dir', help='save wav to directory', type=str) + PARSER.add_argument('--min_sec', help='min duration seconds of saved file', type=float, default=1.0) + PARSER.add_argument('--max_sec', help='max duration seconds of saved file', type=float, default=30.0) + PARSER.add_argument('--normalize', action='store_true', help='Normalize sound range, default is true', default=True) + PARSER.add_argument('--trim', action='store_true', help='Trim silence, default is true', default=True) + PARAMS = PARSER.parse_args() + + main(PARAMS.from_dir, PARAMS.to_dir, PARAMS.min_sec, PARAMS.max_sec, PARAMS.normalize, PARAMS.trim) + + DURATION, FILE_NUM = get_noise_duration(PARAMS.to_dir) + print("Your noise dataset has {} files and a duration of {}\n".format(FILE_NUM, secs_to_hours(DURATION))) diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py index 5877b618ad..10043213b4 100755 --- a/training/deepspeech_training/evaluate.py +++ b/training/deepspeech_training/evaluate.py @@ -43,7 +43,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model): +def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None): if FLAGS.scorer_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet) @@ -51,13 +51,13 @@ def evaluate(test_csvs, create_model): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets] - batch_wav_filename, (batch_x, 
batch_x_len), batch_y = iterator.get_next() + batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next() # One rate per layer no_dropout = [None] * 6 diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py index eed8cd9eb2..dc48cef3f7 100644 --- a/training/deepspeech_training/train.py +++ b/training/deepspeech_training/train.py @@ -228,7 +228,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): the decoded result and the batch's original Y. ''' # Obtain the next batch of data - batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next() + batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next() if FLAGS.train_cudnn: rnn_impl = rnn_impl_cudnn_rnn @@ -239,7 +239,9 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl) # Compute the CTC loss using TensorFlow's `ctc_loss` - total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len) + total_loss = tfv1.nn.ctc_loss(labels=batch_y, + inputs=logits, + sequence_length=batch_seq_len) # Check if any files lead to non finite loss non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss))) @@ -248,7 +250,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): avg_loss = tf.reduce_mean(input_tensor=total_loss) # Finally we return the average loss - return avg_loss, non_finite_files + return avg_loss, non_finite_files, review_audio # Adam Optimization @@ -309,7 +311,7 @@ def get_tower_results(iterator, optimizer, dropout_rates): with tf.name_scope('tower_%d' % i): # Calculate the avg_loss and mean_edit_distance and retrieve the decoded # batch along with the original batch's labels (Y) of this tower - avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) + avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) # Allow for variables to be re-used by the next tower tfv1.get_variable_scope().reuse_variables() @@ -326,6 +328,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) + if FLAGS.review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -415,7 +419,9 @@ def train(): FLAGS.augmentation_freq_and_time_masking or FLAGS.augmentation_pitch_and_tempo_scaling or FLAGS.augmentation_speed_up_std > 0 or - FLAGS.augmentation_sparse_warp): + FLAGS.augmentation_sparse_warp or + FLAGS.train_augmentation_noise_files or + FLAGS.train_augmentation_speech_files): do_cache_dataset = False exception_box = ExceptionBox() @@ -428,7 +434,9 @@ def train(): train_phase=True, exception_box=exception_box, process_ahead=len(Config.available_devices) * FLAGS.train_batch_size * 2, - buffering=FLAGS.read_buffer) + buffering=FLAGS.read_buffer, + noise_sources=FLAGS.train_augmentation_noise_files, + speech_sources=FLAGS.train_augmentation_speech_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), 
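The audio summary just above is registered in its own 'step_audio_summaries' collection so it can be merged and fetched independently of the scalar step summaries. Below is a minimal, self-contained sketch of that collection pattern (not part of this patch; the placeholder tensor, names and sample rate are illustrative only):

import tensorflow.compat.v1 as tfv1

# Under TF 2.x the v1 summary ops need graph mode; guarded so it also runs on TF 1.x.
if hasattr(tfv1, 'disable_eager_execution'):
    tfv1.disable_eager_execution()

# Audio summaries expect a float tensor shaped [batch, frames, channels] (or [batch, frames]).
review_audio = tfv1.placeholder(tfv1.float32, [1, None, 1], name='review_audio')  # illustrative
tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000,
                   collections=['step_audio_summaries'])
tfv1.summary.scalar(name='step_loss', tensor=tfv1.constant(0.0), collections=['step_summaries'])

# Merging per collection lets the training loop run the (large) audio summary only for
# the first few steps while still writing the scalar summary on every step.
step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries')
step_summaries_op = tfv1.summary.merge_all('step_summaries')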
@@ -444,7 +452,9 @@ def train(): train_phase=False, exception_box=exception_box, process_ahead=len(Config.available_devices) * FLAGS.dev_batch_size * 2, - buffering=FLAGS.read_buffer) for source in dev_sources] + buffering=FLAGS.read_buffer, + noise_sources=FLAGS.dev_augmentation_noise_files, + speech_sources=FLAGS.dev_augmentation_speech_files) for source in dev_sources] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -482,6 +492,7 @@ def train(): apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries + step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries') step_summaries_op = tfv1.summary.merge_all('step_summaries') step_summary_writers = { 'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), @@ -541,11 +552,21 @@ def __call__(self, progress, data, **kwargs): session.run(init_op) # Batch loop + + audio_summary_steps = 0 while True: try: - _, current_step, batch_loss, problem_files, step_summary = \ - session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], - feed_dict=feed_dict) + step_audio_summary = None + if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0: + _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], + feed_dict=feed_dict) + audio_summary_steps += 1 + else: + _, current_step, batch_loss, problem_files, step_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], + feed_dict=feed_dict) + exception_box.raise_if_set() except tf.errors.InvalidArgumentError as err: if FLAGS.augmentation_sparse_warp: @@ -566,6 +587,9 @@ def __call__(self, progress, data, **kwargs): pbar.update(step_count) + if step_audio_summary is not None: + step_summary_writer.add_summary(step_audio_summary, current_step) + step_summary_writer.add_summary(step_summary, current_step) if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs: @@ -639,7 +663,7 @@ def __call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model) + samples = evaluate(FLAGS.test_files.split(','), create_model, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -651,7 +675,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False): # Create feature computation graph input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples') samples = tf.expand_dims(input_samples, -1) - mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) + mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) mfccs = tf.identity(mfccs, name='mfccs') # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] @@ -851,7 +875,7 @@ def do_single_file_inference(input_file_path): # Restore variables from training checkpoint load_graph_for_evaluation(session) - features, features_len = audiofile_to_features(input_file_path) + features, features_len, _ = audiofile_to_features(input_file_path) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git 
a/training/deepspeech_training/util/audio_augmentation.py b/training/deepspeech_training/util/audio_augmentation.py
new file mode 100644
index 0000000000..55b8957178
--- /dev/null
+++ b/training/deepspeech_training/util/audio_augmentation.py
@@ -0,0 +1,261 @@
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+import tensorflow.compat.v1 as tfv1
+import numpy as np
+from tensorflow.python.ops import gen_audio_ops as contrib_audio
+import os
+from .logging import log_info
+from .config import Config
+
+
+DBFS_COEF = 10.0 / np.log(10.0)
+
+def filename_to_audio(wav_filename):
+    r"""Decode `wav_filename` and return the audio
+
+    Args:
+        wav_filename: A str, the path of the wav file
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1].
+    """
+    samples = tf.io.read_file(wav_filename)
+    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
+    return decoded.audio
+
+def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce_mean):
+    r"""Measure the dBFS of `audio` chunk by chunk, then return the statistics computed by `reduce_funcs`
+
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+        chunk_ms: An integer in milliseconds, the size of each separately measured chunk, default is `100`.
+        reduce_funcs: A function or a list of functions applied to the per-chunk values, default is `tf.reduce_mean`.
+
+    Returns:
+        A float or a list of floats, depending on whether `reduce_funcs` is a single function or a list.
+    """
+    assert chunk_ms % 10 == 0, 'chunk_ms must be a multiple of 10'
+
+    audio_len = tf.shape(audio)[0]
+    chunk_len = tf.math.floordiv(sample_rate, tf.math.floordiv(1000, chunk_ms))  # default: 1600
+    n_chunks = tf.math.floordiv(audio_len, chunk_len)
+    trim_audio_len = tf.multiply(n_chunks, chunk_len)
+    audio = audio[:trim_audio_len]
+    splits = tf.reshape(audio, shape=[n_chunks, -1])
+
+    squares = tf.square(splits)
+    means = tf.reduce_mean(squares, axis=1)
+
+    # the statistics functions must execute before tf.log(), otherwise the gain dB would be wrong
+    if not isinstance(reduce_funcs, list):
+        reduces = reduce_funcs(means)
+        return DBFS_COEF * tf.math.log(reduces + 1e-8)
+
+    reduces = [reduce_func(means) for reduce_func in reduce_funcs]
+    return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces]
+
+
+def create_noise_iterator(noise_sources, read_csvs_func):
+    r"""Create an iterator that yields noise audio
+
+    Args:
+        noise_sources: A comma-separated str or a list/tuple of str, the directories or CSV files to collect wav filenames from.
+        read_csvs_func: A function, the `read_csvs()` function from `util/feeding.py`, passed in to avoid a circular import.
+
+    Returns:
+        A one-shot iterator of audio, each item a 2-D Tensor of shape [`time-steps`, 1]; use `.get_next()` to get the Tensor.
+    """
+    if isinstance(noise_sources, str):
+        noise_sources = noise_sources.split(',')
+
+    noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_sources, read_csvs_func)), dtype=tf.string)
+    log_info("Collected {} noise files for mixing audio".format(noise_filenames.shape[0]))
+
+    noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames)
+                     .shuffle(min(noise_filenames.shape[0], 102400))
+                     .map(filename_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+                     .prefetch(tfv1.data.experimental.AUTOTUNE)
+                     .repeat())
+    noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset)
+    return noise_iterator
+
+
+def collect_noise_filenames(sources, read_csvs_func):
+    r"""Collect wav filenames from directories or CSV files
+
+    Args:
+        sources: A list/tuple of str, the directories or CSV files to collect wav filenames from.
+        read_csvs_func: A function, the `read_csvs()` function from `util/feeding.py`, passed in to avoid a circular import.
+
+    Returns:
+        A generator of str, yielding every filename that ends with `.wav` or appears in the `wav_filename` column of a CSV.
+    """
+
+    assert isinstance(sources, (list, tuple))
+
+    for source in sources:
+        assert os.path.exists(source)
+        if os.path.isdir(source):
+            for dirpath, _, filenames in os.walk(source):
+                for filename in filenames:
+                    if filename.endswith('.wav'):
+                        yield os.path.join(dirpath, filename)
+        elif os.path.isfile(source):
+            df = read_csvs_func([source])
+            for filename in df['wav_filename']:
+                yield filename
+
+
+def augment_noise(audio,
+                  noise_iterator=None,
+                  speech_iterator=None,
+                  min_n_noises=0,
+                  max_n_noises=1,
+                  min_n_speakers=0,
+                  max_n_speakers=1,
+                  min_audio_dbfs=-35.0,
+                  max_audio_dbfs=0.0,
+                  min_noise_snr_db=3.0,
+                  max_noise_snr_db=30.0,
+                  min_speech_snr_db=3.0,
+                  max_speech_snr_db=30.0,
+                  limit_audio_peak_dbfs=7.0,
+                  limit_noise_peak_dbfs=3.0,
+                  limit_speech_peak_dbfs=7.0,
+                  sample_rate=16000):
+    r"""Mix an audio Tensor with noise Tensors
+
+    If the noise is shorter than the audio, it is automatically repeated until it is longer than the audio.
+    A window of the (repeated) noise is then chosen at random so that it completely covers the audio,
+    i.e. the chosen noise window and the audio have equal shapes.
+
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        noise_iterator: A one-shot iterator over noise files, each yielded item has shape [`time-steps`, 1].
+        speech_iterator: A one-shot iterator over speech files, each yielded item has shape [`time-steps`, 1].
+        min_n_noises: An int, the minimum number of noises mixed into each audio.
+        max_n_noises: An int, the maximum number of noises mixed into each audio.
+        min_n_speakers: An int, the minimum number of speakers mixed into each audio.
+        max_n_speakers: An int, the maximum number of speakers mixed into each audio.
+        min_audio_dbfs: A float in dBFS, the minimum target volume of the audio when gaining the audio.
+        max_audio_dbfs: A float in dBFS, the maximum target volume of the audio when gaining the audio.
+        min_noise_snr_db: A float in dB, the minimum signal-to-noise ratio used when gaining the noise.
+        max_noise_snr_db: A float in dB, the maximum signal-to-noise ratio used when gaining the noise.
+        min_speech_snr_db: A float in dB, the minimum signal-to-noise ratio used when gaining the speech.
+        max_speech_snr_db: A float in dB, the maximum signal-to-noise ratio used when gaining the speech.
+        limit_audio_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `audio`; the audio will not be gained above this value.
+        limit_noise_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `noise`; the noise will not be gained above this value.
+        limit_speech_peak_dbfs: A float in dBFS, the upper limit on the peak chunk dBFS of the `speech`; the speech will not be gained above this value.
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """
+
+    audio_len = tf.shape(audio)[0]
+    audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    mixed_audio = tf.multiply(audio, audio_gain_ratio)
+
+
+    if noise_iterator:
+        n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
+        def mix_noise_func(au):
+            noise = noise_iterator.get_next()
+            noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)
+
+    if speech_iterator:
+        n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
+        def mix_speech_func(au):
+            speech = speech_iterator.get_next()
+            speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
+def extract_noise(noise, audio_len, sample_rate=16000):
+    r"""Prepare a noise segment that can be mixed with the audio
+
+    Args:
+        noise: A 2-D Tensor with shape [`time-steps`, 1]
+        audio_len: A tf.int32 scalar, the audio length
+        sample_rate: An integer, the audio sample rate used to determine the chunk size for dBFS measurement.
+    Returns:
+        A 2-D Tensor with shape [`audio_len`, 1].
+        A float, the mean dBFS of the extracted noise
+        A float, the max dBFS of the extracted noise
+    """
+    noise_len = tf.shape(noise)[0]
+    multiply = tf.math.floordiv(audio_len, noise_len) + 1
+    noise_tile = tf.tile(noise, [multiply, 1])
+
+    # after tiling, the noise is guaranteed to be longer than the audio
+    noise_tile_len = tf.shape(noise_tile)[0]
+
+    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
+    mix_decoded_end_point = mix_decoded_start_point + audio_len
+    extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs
+
+def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
+    r"""Mix `noise` into `audio`; the input audio length must equal the noise length
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """
+
+    # target_snr_db := target_audio_dbfs - target_noise_dbfs
+    target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)
+
+    target_noise_dbfs = audio_dbfs - target_snr_db
+    noise_gain_db = target_noise_dbfs - noise_mean_dbfs
+
+    # limit noise peak
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
+    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)
+
+    audio += tf.multiply(noise, noise_gain_ratio)
+    return audio
+
+def gla(spectrogram, n_iter=10):
+    r"""Reconstruct audio from a spectrogram with the Griffin-Lim algorithm
+
+    Args:
+        spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`].
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1], the audio reconstructed from the spectrogram.
+ """ + frame_length = int(Config.audio_window_samples) + frame_step = int(Config.audio_step_samples) + fft_length = 512 + spectrogram = tf.reshape(spectrogram, shape=[1, -1, 257]) + abs_spectrogram = tf.abs(spectrogram) + + def reconstruct_phases(prev_phases): + xi = tf.complex(abs_spectrogram, 0.0) * prev_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_xi = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_phases = tf.math.exp(tf.complex(0.0, tf.angle(next_xi))) + return next_phases + + rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32) + phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands)) + + reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter) + xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + return tf.transpose(audio) diff --git a/training/deepspeech_training/util/feeding.py b/training/deepspeech_training/util/feeding.py index 79c2da75d0..2d272c6c78 100644 --- a/training/deepspeech_training/util/feeding.py +++ b/training/deepspeech_training/util/feeding.py @@ -2,8 +2,10 @@ from __future__ import absolute_import, division, print_function from functools import partial +import os import numpy as np +import pandas import tensorflow as tf from tensorflow.python.ops import gen_audio_ops as contrib_audio @@ -15,6 +17,19 @@ from .audio import change_audio_types, read_frames_from_file, vad_split, pcm_to_np, DEFAULT_FORMAT, AUDIO_TYPE_NP from .sample_collections import samples_from_files from .helpers import remember_exception, MEGABYTE +from .audio_augmentation import augment_noise, create_noise_iterator, gla + + +def read_csvs(csv_files): + sets = [] + for csv in csv_files: + file = pandas.read_csv(csv, encoding='utf-8', na_filter=False) + #FIXME: not cross-platform + csv_dir = os.path.dirname(os.path.abspath(csv)) + file['wav_filename'] = file['wav_filename'].str.replace(r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1))) # pylint: disable=cell-var-from-loop + sets.append(file) + # Concat all sets, drop any extra columns, re-index the final result as 0..N + return pandas.concat(sets, join='inner', ignore_index=True) def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): @@ -69,12 +84,43 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): upper_frequency_limit=FLAGS.audio_sample_rate/2) mfccs = tf.reshape(mfccs, [-1, Config.n_input]) - return mfccs, tf.shape(input=mfccs)[0] - - -def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None): - features, features_len = samples_to_mfccs(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) - + review_audio = samples + if FLAGS.review_audio_steps and train_phase and any([ + FLAGS.augmentation_spec_dropout_keeprate < 1, + FLAGS.augmentation_freq_and_time_masking, + FLAGS.augmentation_pitch_and_tempo_scaling, + FLAGS.augmentation_speed_up_std > 0]): + review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations) + + return mfccs, tf.shape(input=mfccs)[0], review_audio + +def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None, noise_iterator=None, speech_iterator=None): + + # augment audio + if noise_iterator or speech_iterator: + audio = augment_noise( + audio, + noise_iterator, + speech_iterator, + 
min_n_noises=FLAGS.audio_aug_min_n_noises, + max_n_noises=FLAGS.audio_aug_max_n_noises, + min_n_speakers=FLAGS.audio_aug_min_n_speakers, + max_n_speakers=FLAGS.audio_aug_max_n_speakers, + min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs, + min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db, + max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db, + min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db, + max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs, + limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs, + sample_rate=FLAGS.audio_sample_rate, + ) + + features, features_len, review_audio = samples_to_mfccs(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) + + # augment features if train_phase: if FLAGS.data_aug_features_multiplicative > 0: features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features)) @@ -82,20 +128,20 @@ def audio_to_features(audio, sample_rate, train_phase=False, sample_id=None): if FLAGS.data_aug_features_additive > 0: features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features)) - return features, features_len + return features, features_len, review_audio -def audiofile_to_features(wav_filename, train_phase=False): +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename) + return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename, noise_iterator=noise_iterator, speech_iterator=speech_iterator) -def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False): +def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False, noise_iterator=None, speech_iterator=None): # https://bugs.python.org/issue32117 - features, features_len = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id) + features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id, noise_iterator=noise_iterator, speech_iterator=speech_iterator) sparse_transcript = tf.SparseTensor(*transcript) - return sample_id, features, features_len, sparse_transcript + return sample_id, features, features_len, sparse_transcript, review_audio def to_sparse_tuple(sequence): @@ -114,7 +160,9 @@ def create_dataset(sources, train_phase=False, exception_box=None, process_ahead=None, - buffering=1 * MEGABYTE): + buffering=1 * MEGABYTE, + noise_sources=None, + speech_sources=None): def generate_values(): samples = samples_from_files(sources, buffering=buffering, labeled=True) for sample in change_audio_types(samples, @@ -131,14 +179,25 @@ def sparse_reshape(sparse): shape = sparse.dense_shape return tf.sparse.reshape(sparse, [shape[0], shape[2]]) - def batch_fn(sample_ids, features, features_len, transcripts): + def batch_fn(sample_ids, features, features_len, transcripts, review_audios): features = tf.data.Dataset.zip((features, features_len)) features = features.padded_batch(batch_size, padded_shapes=([None, Config.n_input], [])) transcripts = transcripts.batch(batch_size).map(sparse_reshape) 
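For reference, the gain arithmetic that augment_noise() and mix() implement with TensorFlow ops reduces to a few dBFS/SNR formulas; the NumPy sketch below (not part of the patch, peak limiting omitted, helper name illustrative) only checks that arithmetic numerically:

import numpy as np

def mean_dbfs(x, sample_rate=16000, chunk_ms=100):
    # Mirror of audio_to_dbfs(): mean power per chunk, reduced first, then
    # converted with 10 / ln(10) * ln(power), i.e. 10 * log10(power).
    chunk = sample_rate // (1000 // chunk_ms)            # 1600 samples per chunk
    x = x[:len(x) // chunk * chunk].reshape(-1, chunk)
    return 10.0 / np.log(10.0) * np.log(np.mean(np.mean(x ** 2, axis=1)) + 1e-8)

audio = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0)   # 1 s test tone
noise = 0.05 * np.random.randn(16000)

target_snr_db = 10.0                                     # drawn uniformly in mix()
noise_gain_db = (mean_dbfs(audio) - target_snr_db) - mean_dbfs(noise)
scaled_noise = noise * 10.0 ** (noise_gain_db / 20.0)
mixed = audio + scaled_noise                             # same final step as mix()

# The realised SNR matches the requested 10 dB (up to the 1e-8 floor).
print(mean_dbfs(audio) - mean_dbfs(scaled_noise))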
        sample_ids = sample_ids.batch(batch_size)
-        return tf.data.Dataset.zip((sample_ids, features, transcripts))
-    process_fn = partial(entry_to_features, train_phase=train_phase)
+        # To avoid wasting prefetch throughput, randomly keep only one audio per batch for review
+        if FLAGS.review_audio_steps and batch_size > 1:
+            skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64)
+            review_audio = review_audios.skip(skip_size).batch(1)
+        else:
+            review_audio = review_audios.batch(1)
+
+        return tf.data.Dataset.zip((sample_ids, features, transcripts, review_audio))
+
+    noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
+    speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None
+
+    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
     dataset = (tf.data.Dataset.from_generator(remember_exception(generate_values, exception_box),
                                               output_types=(tf.string, tf.float32, tf.int32,
@@ -167,7 +226,7 @@ def generate_values():
            yield time_start, time_end, samples
    def to_mfccs(time_start, time_end, samples):
-        features, features_len = samples_to_mfccs(samples, audio_format[0])
+        features, features_len, _ = samples_to_mfccs(samples, audio_format[0])
        return time_start, time_end, features, features_len
    def create_batch_set(bs, criteria):
@@ -185,3 +244,9 @@ def create_batch_set(bs, criteria):
    dataset = nds.concatenate(ods)
    dataset = dataset.prefetch(len(Config.available_devices))
    return dataset
+
+
+def secs_to_hours(secs):
+    hours, remainder = divmod(secs, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return '%d:%02d:%02d' % (hours, minutes, seconds)
diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py
index 63bc7bb4ed..9d415c7a88 100644
--- a/training/deepspeech_training/util/flags.py
+++ b/training/deepspeech_training/util/flags.py
@@ -25,6 +25,27 @@ def create_flags():
    # Data Augmentation
    # ================
+    f.DEFINE_string('train_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the train set; if empty, no mixing is done in the train phase')
+    f.DEFINE_string('dev_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the dev set; if empty, no mixing is done in the dev phase')
+    f.DEFINE_string('test_augmentation_noise_files', '', 'comma-separated list of files or dirs with the noise dataset mixed into the test set; if empty, no mixing is done in the test phase')
+    f.DEFINE_string('train_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the train set; if empty, no mixing is done in the train phase')
+    f.DEFINE_string('dev_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the dev set; if empty, no mixing is done in the dev phase')
+    f.DEFINE_string('test_augmentation_speech_files', '', 'comma-separated list of files or dirs with the speech dataset mixed into the test set; if empty, no mixing is done in the test phase')
+    f.DEFINE_float('audio_aug_min_audio_dbfs', -35, 'minimum target volume of the audio in dBFS when gaining the audio')
+    f.DEFINE_float('audio_aug_max_audio_dbfs', 0, 'maximum target volume of the audio in dBFS when gaining the audio')
+    f.DEFINE_float('audio_aug_min_noise_snr_db', 3, 'minimum signal-to-noise ratio in dB used when gaining the noise')
+    f.DEFINE_float('audio_aug_max_noise_snr_db', 30, 'maximum signal-to-noise ratio in dB used when gaining the noise')
+    f.DEFINE_float('audio_aug_min_speech_snr_db', 10, 'minimum signal-to-noise ratio in dB used when gaining the speech')
+    f.DEFINE_float('audio_aug_max_speech_snr_db', 50, 'maximum signal-to-noise ratio in dB used when gaining the speech')
+    f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'upper limit in dBFS on the peak chunk dBFS of the audio; the audio will not be gained above this value')
+    f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'upper limit in dBFS on the peak chunk dBFS of the noise; the noise will not be gained above this value')
+    f.DEFINE_float('audio_aug_limit_speech_peak_dbfs', 3.0, 'upper limit in dBFS on the peak chunk dBFS of the speech; the speech will not be gained above this value')
+    f.DEFINE_integer('audio_aug_min_n_noises', 0, 'minimum number of noises mixed into each audio')
+    f.DEFINE_integer('audio_aug_max_n_noises', 1, 'maximum number of noises mixed into each audio')
+    f.DEFINE_integer('audio_aug_min_n_speakers', 0, 'minimum number of speakers mixed into each audio')
+    f.DEFINE_integer('audio_aug_max_n_speakers', 1, 'maximum number of speakers mixed into each audio')
+
+
    f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
    f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
@@ -50,6 +71,9 @@ def create_flags():
    f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling')
    f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling')
+    f.DEFINE_integer('review_audio_steps', 0, 'number of augmented audio samples to push to the summary directory (if 0, no audio is dumped); one sample per step is saved until the given count is reached')
+    f.DEFINE_integer('review_audio_gla_iterations', 10, 'number of Griffin-Lim iterations used to reconstruct audio from features')
+
    # Global Constants
    # ================
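Taken together, a typical workflow is to prepare a noise set with the new bin/normalize_noise_audio.py script and then point the new flags at the result. A minimal sketch, assuming the usual DeepSpeech.py entry point at the repository root and placeholder data paths:

import subprocess

# 1) Normalize and segment raw noise recordings into 16 kHz wav chunks (paths are illustrative).
subprocess.run(['python', 'bin/normalize_noise_audio.py',
                '--from_dir', 'data/noise_raw',
                '--to_dir', 'data/noise_prepared',
                '--min_sec', '1.0', '--max_sec', '30.0'], check=True)

# 2) Train with noise mixed into the train and dev sets, and dump a few augmented
#    samples to TensorBoard via the new review_audio_steps flag.
subprocess.run(['python', 'DeepSpeech.py',
                '--train_files', 'data/train.csv',
                '--dev_files', 'data/dev.csv',
                '--train_augmentation_noise_files', 'data/noise_prepared',
                '--dev_augmentation_noise_files', 'data/noise_prepared',
                '--audio_aug_min_noise_snr_db', '3',
                '--audio_aug_max_noise_snr_db', '30',
                '--review_audio_steps', '5'], check=True)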