Skip to content

Commit

Permalink
Live audio augmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
tilmankamp committed May 12, 2020
1 parent 9278597 commit c5ceee2
Show file tree
Hide file tree
Showing 14 changed files with 857 additions and 113 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*.pyc
*.swp
*.DS_Store
*.egg-info
.pit*
/.run
/werlog.js
Expand Down
13 changes: 8 additions & 5 deletions bin/build_sdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from deepspeech_training.util.downloader import SIMPLE_BAR
from deepspeech_training.util.sample_collections import (
DirectSDBWriter,
samples_from_files,
samples_from_sources,
)

AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}
Expand All @@ -26,12 +26,10 @@ def build_sdb():
with DirectSDBWriter(
CLI_ARGS.target, audio_type=audio_type, labeled=not CLI_ARGS.unlabeled
) as sdb_writer:
samples = samples_from_files(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
bar = progressbar.ProgressBar(max_value=len(samples), widgets=SIMPLE_BAR)
for sample in bar(
change_audio_types(
samples, audio_type=audio_type, processes=CLI_ARGS.workers
)
change_audio_types(samples, audio_type=audio_type, bitrate=CLI_ARGS.bitrate, processes=CLI_ARGS.workers)
):
sdb_writer.add(sample)

Expand All @@ -55,6 +53,11 @@ def handle_args():
choices=AUDIO_TYPE_LOOKUP.keys(),
help="Audio representation inside target SDB",
)
parser.add_argument(
"--bitrate",
type=int,
help="Bitrate for lossy compressed SDB samples like in case of --audio-type opus",
)
parser.add_argument(
"--workers", type=int, default=None, help="Number of encoding SDB workers"
)
Expand Down
66 changes: 66 additions & 0 deletions bin/compare_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python
"""
Tool for comparing two wav samples
"""
import sys
import argparse

from deepspeech_training.util.audio import AUDIO_TYPE_NP, mean_dbfs
from deepspeech_training.util.sample_collections import load_sample


def fail(message):
    """
    Report an error and terminate the process.

    Writes ``message`` followed by a newline to stderr, flushes the stream and
    exits with status code 1.
    """
    sys.stderr.write("{}\n".format(message))
    sys.stderr.flush()
    raise SystemExit(1)


def compare_samples():
    """
    Compare the two samples named on the command line.

    Both samples are loaded through load_sample(). The run fails (see fail())
    when their audio formats or durations differ. Otherwise both are converted
    to AUDIO_TYPE_NP and mean_dbfs() of their element-wise difference is
    compared against CLI_ARGS.threshold:

    - default mode: fail if the difference is above the threshold,
    - with CLI_ARGS.if_differ: fail if the difference is at or below the threshold.

    On success the matching message is printed to stderr unless
    CLI_ARGS.no_success_output is set.
    """
    first = load_sample(CLI_ARGS.sample1)
    second = load_sample(CLI_ARGS.sample2)

    # Basic properties have to match in either mode.
    if first.audio_format != second.audio_format:
        fail('Samples differ on: audio-format ({} and {})'.format(first.audio_format, second.audio_format))
    if first.duration != second.duration:
        fail('Samples differ on: duration ({} and {})'.format(first.duration, second.duration))

    first.change_audio_type(AUDIO_TYPE_NP)
    second.change_audio_type(AUDIO_TYPE_NP)
    diff_dbfs = mean_dbfs(first.audio - second.audio)

    differ_msg = 'Samples differ on: sample data ({:0.2f} dB difference) '.format(diff_dbfs)
    equal_msg = 'Samples are considered equal ({:0.2f} dB difference)'.format(diff_dbfs)

    # Pick the failure condition and the two messages according to the check direction.
    if CLI_ARGS.if_differ:
        failed = diff_dbfs <= CLI_ARGS.threshold
        failure_msg, success_msg = equal_msg, differ_msg
    else:
        failed = diff_dbfs > CLI_ARGS.threshold
        failure_msg, success_msg = differ_msg, equal_msg

    if failed:
        fail(failure_msg)
    if not CLI_ARGS.no_success_output:
        print(success_msg, file=sys.stderr, flush=True)


def handle_args():
    """
    Parse the command line of the sample comparison tool.

    Returns:
        argparse.Namespace with the attributes ``sample1`` and ``sample2``
        (filenames), ``threshold`` (float, default -60.0), ``if_differ`` (bool)
        and ``no_success_output`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="Tool for checking similarity of two samples"
    )
    parser.add_argument("sample1", help="Filename of sample 1 to compare")
    parser.add_argument("sample2", help="Filename of sample 2 to compare")
    parser.add_argument(
        "--threshold",
        type=float,
        default=-60.0,
        help="dB of sample deltas above which they are considered different",
    )
    parser.add_argument(
        "--if-differ",
        action="store_true",
        # Adjacent literals are concatenated verbatim, so the first fragment
        # has to end with the separating space.
        help="If to succeed and return status code 0 on different signals and fail on equal ones (inverse check). "
        "This will still fail on different formats or durations.",
    )
    parser.add_argument(
        "--no-success-output",
        action="store_true",
        help="Stay silent on success (if samples are equal or - with --if-differ - samples are not equal)",
    )
    return parser.parse_args()


# Script entry point: parse the command line into the module-level CLI_ARGS
# (read by compare_samples) and run the comparison.
if __name__ == "__main__":
CLI_ARGS = handle_args()
compare_samples()
110 changes: 76 additions & 34 deletions bin/play.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,72 @@
#!/usr/bin/env python
"""
Tool for playing samples from Sample Databases (SDB files) and DeepSpeech CSV files
Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files
Use "python3 play.py -h" for help
"""

import argparse
import random
import os
import sys
import random
import argparse

from deepspeech_training.util.audio import AUDIO_TYPE_PCM
from deepspeech_training.util.sample_collections import LabeledSample, samples_from_file


def play_sample(samples, index):
if index < 0:
index = len(samples) + index
if CLI_ARGS.random:
index = random.randint(0, len(samples))
elif index >= len(samples):
print("No sample with index {}".format(CLI_ARGS.start))
sys.exit(1)
sample = samples[index]
print('Sample "{}"'.format(sample.sample_id))
if isinstance(sample, LabeledSample):
print(' "{}"'.format(sample.transcript))
sample.change_audio_type(AUDIO_TYPE_PCM)
rate, channels, width = sample.audio_format
wave_obj = simpleaudio.WaveObject(sample.audio, channels, width, rate)
play_obj = wave_obj.play()
play_obj.wait_done()
from deepspeech_training.util.audio import LOADABLE_AUDIO_EXTENSIONS, AUDIO_TYPE_PCM, AUDIO_TYPE_WAV
from deepspeech_training.util.sample_collections import SampleList, LabeledSample, samples_from_source, prepare_samples


def play_collection():
samples = samples_from_file(CLI_ARGS.collection, buffering=0)
def get_samples_in_play_order():
ext = os.path.splitext(CLI_ARGS.source)[1].lower()
if ext in LOADABLE_AUDIO_EXTENSIONS:
samples = SampleList([(CLI_ARGS.source, 0)], labeled=False)
else:
samples = samples_from_source(CLI_ARGS.source, buffering=0)
played = 0
index = CLI_ARGS.start
while True:
if 0 <= CLI_ARGS.number <= played:
return
play_sample(samples, index)
if CLI_ARGS.random:
yield samples[random.randint(0, len(samples) - 1)]
elif index < 0:
yield samples[len(samples) + index]
elif index >= len(samples):
print("No sample with index {}".format(CLI_ARGS.start))
sys.exit(1)
else:
yield samples[index]
played += 1
index = (index + 1) % len(samples)


def play_collection():
    """
    Play the selected samples, or pipe the first one to stdout.

    The samples yielded by get_samples_in_play_order() are passed through
    prepare_samples() as AUDIO_TYPE_PCM, together with the augmentation specs
    (--augment) and the clock value (--clock) from the command line.

    Unless --quiet is given, the ID of each sample - and the transcript of
    labeled samples - is logged to stderr. With --pipe, the first sample is
    converted to AUDIO_TYPE_WAV, written to stdout and the function returns.
    Otherwise each sample is played through simpleaudio, waiting for playback
    to finish before moving on to the next one.
    """
    prepared = prepare_samples(
        get_samples_in_play_order(),
        audio_type=AUDIO_TYPE_PCM,
        augmentation_specs=CLI_ARGS.augment,
        process_ahead=0,
        fixed_clock=CLI_ARGS.clock,
    )
    for current in prepared:
        if not CLI_ARGS.quiet:
            # Info output goes to stderr, keeping stdout free for --pipe data.
            print('Sample "{}"'.format(current.sample_id), file=sys.stderr)
            if isinstance(current, LabeledSample):
                print(' "{}"'.format(current.transcript), file=sys.stderr)
        if CLI_ARGS.pipe:
            current.change_audio_type(AUDIO_TYPE_WAV)
            sys.stdout.buffer.write(current.audio.getvalue())
            return
        pcm_data = current.audio
        fmt = current.audio_format
        playback = simpleaudio.WaveObject(pcm_data, fmt.channels, fmt.width, fmt.rate).play()
        playback.wait_done()


def handle_args():
parser = argparse.ArgumentParser(
description="Tool for playing samples from Sample Databases (SDB files) "
description="Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) "
"and DeepSpeech CSV files"
)
parser.add_argument("collection", help="Sample DB or CSV file to play samples from")
parser.add_argument("source", help="Sample DB, CSV or WAV file to play samples from")
parser.add_argument(
"--start",
type=int,
Expand All @@ -66,16 +84,40 @@ def handle_args():
action="store_true",
help="If samples should be played in random order",
)
parser.add_argument(
"--augment",
action='append',
help="Add an augmentation operation",
)
# Adjacent string literals are concatenated verbatim, so every fragment of the
# help text ends with the space that separates it from the next one.
parser.add_argument(
    "--clock",
    type=float,
    default=0.5,
    help="Simulates clock value used for augmentations during training. "
    "Ranges from 0.0 (representing parameter start values) to "
    "1.0 (representing parameter end values)",
)
parser.add_argument(
"--pipe",
action="store_true",
help="Pipe first sample as wav file to stdout. Forces --number to 1.",
)
parser.add_argument(
"--quiet",
action="store_true",
help="No info logging to console",
)
return parser.parse_args()


if __name__ == "__main__":
try:
import simpleaudio
except ModuleNotFoundError:
print('play.py requires Python package "simpleaudio"')
sys.exit(1)
CLI_ARGS = handle_args()
if not CLI_ARGS.pipe:
try:
import simpleaudio
except ModuleNotFoundError:
print('Unless using the --pipe flag, play.py requires Python package "simpleaudio" for playing samples')
sys.exit(1)
try:
play_collection()
except KeyboardInterrupt:
Expand Down
66 changes: 66 additions & 0 deletions bin/run-tc-signal_augmentations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/sh

# Smoke test for the signal augmentations offered by bin/play.py.
#
# Each augmentation is applied to the LDC93S1 sample, the result is piped into
# a WAV file under /tmp and then compared against the original with
# bin/compare_samples.py --if-differ. That check only succeeds if the sample
# data differs while audio format and duration stay the same.

set -xe

# "&&" makes a failing "cd" abort the script (through set -e) instead of
# silently continuing with the current working directory.
ldc93s1_dir=$(cd data/smoke_test && pwd)
ldc93s1_csv="${ldc93s1_dir}/LDC93S1.csv"
ldc93s1_wav="${ldc93s1_dir}/LDC93S1.wav"
ldc93s1_overlay_csv="${ldc93s1_dir}/LDC93S1_overlay.csv"
ldc93s1_overlay_wav="${ldc93s1_dir}/LDC93S1_reversed.wav"

# Both variables hold a command plus its arguments; they are deliberately
# expanded unquoted below so that they split into separate words.
play="python bin/play.py --number 1 --quiet"
compare="python bin/compare_samples.py --no-success-output"

if [ ! -f "${ldc93s1_csv}" ]; then
    echo "Downloading and preprocessing LDC93S1 example data, saving in ${ldc93s1_dir}."
    python -u bin/import_ldc93s1.py "${ldc93s1_dir}"
fi

if [ ! -f "${ldc93s1_overlay_csv}" ]; then
    echo "Reversing ${ldc93s1_wav} to ${ldc93s1_overlay_wav}."
    sox "${ldc93s1_wav}" "${ldc93s1_overlay_wav}" reverse

    echo "Creating ${ldc93s1_overlay_csv}."
    # The path is passed as an argument, not as part of the format string, so
    # "%" or "\" characters in it are written literally.
    printf 'wav_filename\n%s\n' "${ldc93s1_overlay_wav}" > "${ldc93s1_overlay_csv}"
fi

# Sanity check of the comparison tool itself: the sample and its reversed
# version have to be reported as different.
if ! $compare --if-differ "${ldc93s1_wav}" "${ldc93s1_overlay_wav}"; then
    echo "Sample comparison tool not working correctly"
    exit 1
fi

# The augmentation specs are quoted to keep the shell from treating "[...]" as
# a pathname expansion pattern.
$play "${ldc93s1_wav}" --augment "overlay[source=${ldc93s1_overlay_csv},snr=20]" --pipe >/tmp/overlay-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/overlay-test.wav; then
    echo "Overlay augmentation had no effect or changed basic sample properties"
    exit 1
fi

$play "${ldc93s1_wav}" --augment "reverb[delay=50.0,decay=2.0]" --pipe >/tmp/reverb-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/reverb-test.wav; then
    echo "Reverb augmentation had no effect or changed basic sample properties"
    exit 1
fi

$play "${ldc93s1_wav}" --augment "gaps[n=10,size=100.0]" --pipe >/tmp/gaps-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/gaps-test.wav; then
    echo "Gaps augmentation had no effect or changed basic sample properties"
    exit 1
fi

$play "${ldc93s1_wav}" --augment "resample[rate=4000]" --pipe >/tmp/resample-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/resample-test.wav; then
    echo "Resample augmentation had no effect or changed basic sample properties"
    exit 1
fi

$play "${ldc93s1_wav}" --augment "codec[bitrate=4000]" --pipe >/tmp/codec-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/codec-test.wav; then
    echo "Codec augmentation had no effect or changed basic sample properties"
    exit 1
fi

$play "${ldc93s1_wav}" --augment volume --pipe >/tmp/volume-test.wav
if ! $compare --if-differ "${ldc93s1_wav}" /tmp/volume-test.wav; then
    echo "Volume augmentation had no effect or changed basic sample properties"
    exit 1
fi
Loading

0 comments on commit c5ceee2

Please sign in to comment.