-
Notifications
You must be signed in to change notification settings - Fork 4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
online mix noise audio data in training step #2622
base: master
Are you sure you want to change the base?
Changes from all commits
681f470
421243d
d08efad
b0a14b5
ba1a587
aebd08d
d255c3f
ec25136
484134e
4f24f08
1f57ece
66cc7c4
b7eb0f4
ccae7cc
8cc95f9
9e2648a
2269514
0b8147c
42bc45b
289722d
9334e79
25736e0
40b431b
f7d1279
7792226
c4c3ced
c151b1d
c089b7f
491a4b0
735cbbb
2fa91e8
6b820bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
from __future__ import absolute_import, division, print_function | ||
|
||
# Make sure we can import stuff from util/ | ||
# This script needs to be run from the root of the DeepSpeech repository | ||
|
||
from librosa import get_duration | ||
from multiprocessing import Pool | ||
from functools import partial | ||
import math | ||
import argparse | ||
import sys | ||
import os | ||
import progressbar | ||
sys.path.insert(1, os.path.join(sys.path[0], '..')) | ||
|
||
from util.feeding import secs_to_hours | ||
|
||
try: | ||
from pydub import AudioSegment | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We want to keep dependencies at a minimum. Please check whether your required functionality could be covered by e.g. |
||
except ImportError as err: | ||
print('[ImportError] try `sudo apt-get install ffmpeg && pip install pydub`') | ||
raise err | ||
|
||
|
||
def detect_silence(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10):
    """Locate the span of ``sound`` left after skipping quiet edges.

    The clip is scanned in windows of ``chunk_size`` milliseconds, first
    forwards from the start and then backwards from the end, for as long as
    a window's ``dBFS`` is below ``silence_threshold``.

    Args:
        sound: Clip to scan. It is sliced by millisecond offsets and its
            length is taken with ``len()``.
        silence_threshold: Level (dBFS) below which a window counts as
            silent.
        chunk_size: Window length in milliseconds; must be positive.

    Returns:
        A ``(start_ms, end_ms)`` tuple with ``start_ms <= end_ms``, both
        clamped to ``[0, len(sound)]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive step would never advance the scans below. Raised
        # explicitly because ``assert`` is stripped under ``python -O``.
        raise ValueError('chunk_size must be positive, got {}'.format(chunk_size))

    sound_size = len(sound)

    # The bound is tested first so no window is sliced once the scan has
    # already run off the clip.
    start_trim = 0  # ms
    while (start_trim < sound_size
           and sound[start_trim:(start_trim + chunk_size)].dBFS < silence_threshold):
        start_trim += chunk_size

    end_trim = sound_size  # ms
    while (end_trim > 0
           and sound[(end_trim - chunk_size):end_trim].dBFS < silence_threshold):
        end_trim -= chunk_size

    # The scans move in whole windows and may overshoot either end.
    start_trim = min(sound_size, start_trim)
    end_trim = max(0, end_trim)

    # NOTE: if the two scans cross, the bounds are swapped rather than
    # collapsed; in particular a clip in which every window is below the
    # threshold yields (0, len(sound)), not an empty span.
    return min(start_trim, end_trim), max(start_trim, end_trim)
|
||
|
||
def trim_silence_audio(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10):
    """Return ``sound`` sliced to the span reported by ``detect_silence``.

    ``silence_threshold`` and ``chunk_size`` are forwarded unchanged to
    ``detect_silence``; the two offsets it returns are used as the slice
    bounds.
    """
    bounds = detect_silence(sound, silence_threshold, chunk_size)
    first_ms, last_ms = bounds
    return sound[first_ms:last_ms]
|
||
|
||
def _load_noise_file(filepath):
    """Decode ``filepath``; return ``None`` when a ``.raw`` file cannot be read."""
    if filepath.endswith('.wav'):
        return AudioSegment.from_file(filepath)

    # Raw input is read as sample_width=2, channels=1, first at 44100 Hz and
    # then at 48000 Hz.
    # NOTE(review): a reviewer asked why the retry exists; it is not clear
    # from this code which failure a second frame rate recovers from -
    # confirm before relying on it.
    try:
        return AudioSegment.from_raw(filepath,
                                     sample_width=2,
                                     frame_rate=44100,
                                     channels=1)
    except Exception as err:  # pylint: disable=broad-except
        print('Retrying conversion: {}'.format(err))

    try:
        return AudioSegment.from_raw(filepath,
                                     sample_width=2,
                                     frame_rate=48000,
                                     channels=1)
    except Exception as err:  # pylint: disable=broad-except
        print('Skipping file {}, got error: {}'.format(filepath, err))
        return None


def convert(filename, dst_dirpath, dirpath, normalize, trim_silence,
            min_duration_seconds, max_duration_seconds, sample_rate=16000):
    """Convert one noise recording into resampled WAV chunks.

    NOTE(review): a reviewer asked whether this is already covered by, or
    could be merged into, the project's existing ``audio.py``.

    Args:
        filename: Name of the source file inside ``dirpath``. Names that do
            not end in ``.wav`` or ``.raw`` are ignored.
        dst_dirpath: Directory the chunks are written to.
        dirpath: Directory containing ``filename``.
        normalize: If true, each chunk is normalized, and compressed and
            re-normalized up to twice while its ``dBFS`` is below -30.0.
        trim_silence: If true, each chunk is passed through
            ``trim_silence_audio``.
        min_duration_seconds: Chunks shorter than this after processing are
            not written.
        max_duration_seconds: The recording is cut into
            ``ceil(duration / max_duration_seconds)`` pieces of equal length.
        sample_rate: Frame rate the recording is converted to before
            splitting. Defaults to the previously hard-coded 16000.

    Returns:
        None. Chunk ``i`` is exported as ``<i>_<filename>`` with a ``.raw``
        suffix replaced by ``.wav``.
    """
    if not filename.endswith(('.wav', '.raw')):
        return

    filepath = os.path.join(dirpath, filename)
    sound = _load_noise_file(filepath)
    if sound is None:
        return

    try:
        sound = sound.set_frame_rate(sample_rate)
    except Exception as err:  # pylint: disable=broad-except
        print('Skipping {}'.format(err))
        return

    # Splitting long recordings into several files was welcomed in review as
    # a way to limit wasted overlap during augmentation.
    n_splits = max(1, math.ceil(sound.duration_seconds / max_duration_seconds))
    chunk_duration_ms = math.ceil(len(sound) / n_splits)

    for i in range(n_splits):
        dst_path = os.path.join(dst_dirpath, str(i) + '_' + filename)
        if dst_path.endswith('.raw'):
            dst_path = dst_path[:-4] + '.wav'

        if os.path.exists(dst_path):
            print('Audio already exists: {}'.format(dst_path))
            # Only this chunk is skipped; returning here would silently drop
            # every later chunk of the same recording.
            continue

        end_ms = min((i + 1) * chunk_duration_ms, len(sound))
        chunk = sound[(i * chunk_duration_ms):end_ms]

        if normalize:
            chunk = chunk.normalize()
            if chunk.dBFS < -30.0:
                chunk = chunk.compress_dynamic_range().normalize()
                if chunk.dBFS < -30.0:
                    chunk = chunk.compress_dynamic_range().normalize()
        if trim_silence:
            chunk = trim_silence_audio(chunk)

        if chunk.duration_seconds < min_duration_seconds:
            # A too-short chunk says nothing about the remaining ones.
            continue
        chunk.export(dst_path, format='wav')
|
||
|
||
def get_noise_duration(dst_dir):
    """Total up the ``.wav`` files found anywhere below ``dst_dir``.

    Returns:
        A ``(total_duration, file_count)`` tuple. ``total_duration`` is the
        sum of ``get_duration`` over every matching file (presumably
        seconds, since the script hands it to ``secs_to_hours``).
    """
    total = 0.0
    count = 0
    for root, _subdirs, names in os.walk(dst_dir):
        wav_paths = [os.path.join(root, name)
                     for name in names
                     if name.endswith('.wav')]
        for wav_path in wav_paths:
            total += get_duration(filename=wav_path)
            count += 1
    return total, count
|
||
|
||
def main(src_dir,
         dst_dir,
         min_duration_seconds,
         max_duration_seconds,
         normalize=True,
         trim_silence=True):
    """Convert every file under ``src_dir`` into ``dst_dir``, mirroring its layout.

    Each directory found by walking ``src_dir`` gets a matching directory
    under ``dst_dir``, and its files are handed to ``convert`` through a
    process pool.

    Args:
        src_dir: Existing directory tree holding the source recordings.
        dst_dir: Output root; created if missing.
        min_duration_seconds: Forwarded to ``convert``.
        max_duration_seconds: Forwarded to ``convert``.
        normalize: Forwarded to ``convert``.
        trim_silence: Forwarded to ``convert``.

    Raises:
        FileNotFoundError: If ``src_dir`` does not exist.
    """
    if not os.path.exists(src_dir):
        # Explicit check: an ``assert`` would disappear under ``python -O``.
        raise FileNotFoundError('Source directory does not exist: {}'.format(src_dir))

    # exist_ok=True avoids the race between an exists() test and makedirs().
    os.makedirs(dst_dir, exist_ok=True)
    src_dir = os.path.abspath(src_dir)
    dst_dir = os.path.abspath(dst_dir)

    # One pool serves the whole walk and is torn down on exit; creating a
    # new pool per directory without closing it leaks worker processes.
    with Pool(processes=None) as pool:
        for dirpath, _, filenames in os.walk(src_dir):
            dirpath = os.path.abspath(dirpath)
            # relpath strips only the leading src_dir and is separator-aware;
            # str.replace would also remove later occurrences of the text.
            dst_dirpath = os.path.normpath(
                os.path.join(dst_dir, os.path.relpath(dirpath, src_dir)))

            print('Converting directory: {} -> {}'.format(dirpath, dst_dirpath))
            os.makedirs(dst_dirpath, exist_ok=True)

            convert_func = partial(convert,
                                   dst_dirpath=dst_dirpath,
                                   dirpath=dirpath,
                                   normalize=normalize,
                                   trim_silence=trim_silence,
                                   min_duration_seconds=min_duration_seconds,
                                   max_duration_seconds=max_duration_seconds)

            pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset',
                                           max_value=len(filenames)).start()
            # Count finished files from 1 so the bar reaches max_value.
            for done, _ in enumerate(pool.imap_unordered(convert_func, filenames), start=1):
                pbar.update(done)
            pbar.finish()
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: convert --from_dir into --to_dir, then report
    # how many files were produced and their total duration.
    PARSER = argparse.ArgumentParser(description='Optimize noise files')
    # Both directories are mandatory; without them main() would receive None.
    PARSER.add_argument('--from_dir', help='Convert wav from directory', type=str, required=True)
    PARSER.add_argument('--to_dir', help='save wav to directory', type=str, required=True)
    # NOTE(review): a reviewer asked that this tool also be able to produce
    # SDBs like the project's SDB tool.
    PARSER.add_argument('--min_sec', help='min duration seconds of saved file', type=float, default=1.0)
    PARSER.add_argument('--max_sec', help='max duration seconds of saved file', type=float, default=30.0)
    # A `store_true` flag whose default is already True can never produce
    # False, so the original flags stay accepted and `--no_*` counterparts
    # sharing the same dest make it possible to switch each step off.
    PARSER.add_argument('--normalize', action='store_true', help='Normalize sound range, default is true', default=True)
    PARSER.add_argument('--no_normalize', dest='normalize', action='store_false', help='Do not normalize sound range')
    PARSER.add_argument('--trim', action='store_true', help='Trim silence, default is true', default=True)
    PARSER.add_argument('--no_trim', dest='trim', action='store_false', help='Do not trim silence')
    PARAMS = PARSER.parse_args()

    main(PARAMS.from_dir, PARAMS.to_dir, PARAMS.min_sec, PARAMS.max_sec, PARAMS.normalize, PARAMS.trim)

    DURATION, FILE_NUM = get_noise_duration(PARAMS.to_dir)
    print("Your noise dataset has {} files and a duration of {}\n".format(FILE_NUM, secs_to_hours(DURATION)))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about `prepare_noise.py`?