Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Commit

Permalink
Add a utils file for recordings and refactor to use it.
Browse files Browse the repository at this point in the history
  • Loading branch information
hmzh-khn committed Jan 14, 2019
1 parent a95035e commit 48727ce
Show file tree
Hide file tree
Showing 2 changed files with 209 additions and 80 deletions.
160 changes: 160 additions & 0 deletions audio_preprocessing/recording_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
A file containing utils for dealing with Tarteel audio recordings.
Author: Hamzah Khan
Date: Jan. 13, 2019
"""

import numpy as np
import os
import wave
import webrtcvad

DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5
ALL_SURAHS = None
NUM_SURAHS = 114

def get_paths_to_all_recordings(local_download_dir):
    """
    Collects the paths of the recordings stored under the given download directory.

    This simply delegates to get_paths_to_surah_recordings without naming any surahs, so that function's default
    surah selection is used, and hands back the list it produces.
    """
    all_recording_paths = get_paths_to_surah_recordings(local_download_dir)
    return all_recording_paths

def get_paths_to_surah_recordings(local_download_dir, surahs=ALL_SURAHS):
    """
    Returns a list of paths, in the given directory, to recordings of ayahs in the specified surahs.

    Recordings are looked up under "<local_download_dir>/s<surah>/<ayah directory>/".

    Args:
        local_download_dir: Root directory holding the per-surah "s<N>" directories.
        surahs: Iterable of surah numbers to include. None or an empty collection selects surahs 1 through
            NUM_SURAHS.

    Returns:
        A list of paths to every file found inside the ayah directories of the selected surahs. Surah directories
        that do not exist contribute nothing.
    """
    # Test for "no selection" explicitly instead of with `not surahs`: truth-testing a numpy array with more than
    # one element raises ValueError.
    if surahs is None or (hasattr(surahs, "__len__") and len(surahs) == 0):
        surahs = range(1, NUM_SURAHS + 1)

    paths_to_audio = []

    for surah_num in surahs:
        local_surah_dir = os.path.join(local_download_dir, "s" + str(surah_num))

        # Only the immediate children of the surah directory are ayah directories. os.walk yields nothing for a
        # missing directory, hence the default.
        _, ayah_directories, _ = next(os.walk(local_surah_dir), (local_surah_dir, [], []))

        for ayah_directory in ayah_directories:
            local_ayah_dir = os.path.join(local_surah_dir, ayah_directory)

            # os.walk recurses, so join each filename with the directory it was actually found in rather than
            # with the top-level ayah directory.
            for current_dir, _, recording_filenames in os.walk(local_ayah_dir):
                for recording_filename in recording_filenames:
                    # Add the fully constructed path name to the list of paths.
                    paths_to_audio.append(os.path.join(current_dir, recording_filename))

    return paths_to_audio

def get_paths_to_ayah_recordings(local_download_dir, ayahs):
    """
    Returns a list of paths, in the given directory, to recordings of the specified ayahs.

    Recordings are looked up under "<local_download_dir>/s<surah>/a<ayah>/".

    Args:
        local_download_dir: Root directory holding the per-surah "s<N>" directories.
        ayahs: Iterable of (surah_num, ayah_num) pairs.

    Returns:
        A list of paths to every file found under the directories of the requested ayahs. Ayah directories that do
        not exist contribute nothing.

    Raises:
        ValueError: If ayahs is None or an empty collection.
    """
    # Explicit emptiness test: `not ayahs` raises ValueError for a numpy array with more than one element.
    if ayahs is None or (hasattr(ayahs, "__len__") and len(ayahs) == 0):
        raise ValueError('Invalid list of ayahs - should contain a tuples of surah-ayah pairs.')

    paths_to_audio = []

    for surah_num, ayah_num in ayahs:
        local_ayah_dir = os.path.join(local_download_dir, "s" + str(surah_num), "a" + str(ayah_num))

        # os.walk recurses, so join each filename with the directory it was actually found in; for files directly
        # inside the ayah directory this is the same path as "<dir>/s<surah>/a<ayah>/<filename>".
        for current_dir, _, recording_filenames in os.walk(local_ayah_dir):
            for recording_filename in recording_filenames:
                # Add the fully constructed path name to the list of paths.
                paths_to_audio.append(os.path.join(current_dir, recording_filename))

    return paths_to_audio

def get_path_to_recording(local_download_dir, surah_num, ayah_num, filename):
    """
    Builds the location of one recording inside the download directory.

    The result has the form "<local_download_dir>/s<surah_num>/a<ayah_num>/<filename>", joined with the
    platform's path separator. Nothing is checked against the filesystem.
    """
    surah_dir_name = "s{}".format(surah_num)
    ayah_dir_name = "a{}".format(ayah_num)
    return os.path.join(local_download_dir, surah_dir_name, ayah_dir_name, filename)

def get_path_to_recording_by_id(local_download_dir, surah_num, ayah_num, recording_id, file_extension='wav'):
    """
    Builds the location of one recording from its surah number, ayah number and recording id.

    The filename is "<surah_num>_<ayah_num>_<recording_id>.<file_extension>" (the three numbers formatted as
    integers); the directory part is produced by get_path_to_recording.
    """
    name_fields = (surah_num, ayah_num, recording_id, file_extension)
    recording_filename = "%d_%d_%d.%s" % name_fields
    return get_path_to_recording(local_download_dir, surah_num, ayah_num, recording_filename)

def open_recording(path_to_audio):
    """
    Reads the WAVE recording at the given path.

    Returns a flat 4-tuple (has_valid_header, wav_bytes, sample_rate_hz, num_channels):
      - (True, audio frames as bytes, sample rate in hz, number of channels) if the file could be opened, or
      - (False, None, None, None) if it could not.

    Side effect: a file that cannot be opened is deleted from disk.

    Note: As of now, "valid" is defined as whether the wave library can open the file.
    """
    try:
        # The context manager closes the file even if reading the frames fails.
        with wave.open(path_to_audio, "rb") as wf:
            wav_bytes = wf.readframes(wf.getnframes())
            sample_rate_hz = wf.getframerate()
            num_channels = wf.getnchannels()

    # wave raises wave.Error for a malformed header and EOFError for an empty or truncated file; both mean the
    # recording is unusable, so print an error, remove the file and exit the function.
    except (wave.Error, EOFError):
        print("Invalid wave header found", path_to_audio, ", removing.")
        os.remove(path_to_audio)
        return (False, None, None, None)

    return (True, wav_bytes, sample_rate_hz, num_channels)

# Fraction of non-speech reported when a recording contains no complete VAD frame (positive, > 0.5).
NO_FRAMES_VALUE = 1.0


def has_speech(wav_bytes,
               sample_rate_hz,
               num_channels,
               non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
               verbose=False):
    """
    Returns True if the fraction of 10 ms frames without voice activity is below non_speech_threshold_fraction.

    Args:
        wav_bytes: Raw audio frames; assumed to be 16-bit PCM.
        sample_rate_hz: Sample rate of the audio in hz.
        num_channels: Number of channels. For 2-channel audio only the left channel is examined.
        non_speech_threshold_fraction: Upper bound (exclusive) on the fraction of non-speech frames.
        verbose: If True, print the frame counts and the non-speech fraction.

    Returns:
        False if no frame was classified as speech (including when the audio is shorter than one frame), if every
        frame was classified as speech, or if the non-speech fraction reaches the threshold; True otherwise.

    Note: webrtc VAD does not currently support 44.1 kHz. Frames at a sample rate the VAD rejects are counted as
    non-speech, so such recordings are always reported as having no speech.
    """
    mono_channel_bytes = wav_bytes

    if num_channels == 2:
        # just take the left channel for simplicity purposes.
        # We're just trying to get a quick sanity check, no need
        # to mix the two channels.
        mono_channel_bytes = b"".join([wav_bytes[i:i+2] for i in range(0, len(wav_bytes), 4)])

    # Use webrtc's VAD in aggressiveness mode 1 (modes range from 0, least aggressive, to 3).
    vad = webrtcvad.Vad(1)
    frame_duration = 10  # ms
    bytes_per_sample = 2  # assuming 16-bit PCM.
    samples_per_vaded_chunk = (sample_rate_hz * frame_duration / 1000)
    bytes_per_vaded_chunk = int(samples_per_vaded_chunk*bytes_per_sample)
    num_speech_frames = 0
    num_non_speech_frames = 0

    # Every chunk examined below is a full chunk, so the rate/length validity check is the same for all of them.
    vad_frame_length = int(bytes_per_vaded_chunk / bytes_per_sample)
    rate_and_length_supported = webrtcvad.valid_rate_and_frame_length(sample_rate_hz, vad_frame_length)

    # The "+ 1" makes the range include the last complete chunk when the audio length is an exact multiple of
    # the chunk size; a trailing partial chunk is still skipped.
    for i in range(0, len(mono_channel_bytes) - bytes_per_vaded_chunk + 1, bytes_per_vaded_chunk):
        chunk_to_vad = mono_channel_bytes[i:i+bytes_per_vaded_chunk]
        if rate_and_length_supported and vad.is_speech(chunk_to_vad, sample_rate_hz):
            num_speech_frames += 1
        else:
            num_non_speech_frames += 1

    has_frames = (num_speech_frames + num_non_speech_frames > 0)
    # NOTE(review): audio in which every frame is speech is also treated as empty here; this is kept as-is on the
    # assumption that it is deliberate - confirm.
    is_empty_audio = (num_speech_frames == 0 or (num_speech_frames and num_non_speech_frames == 0))

    if has_frames:
        percentage_non_speech = (float(num_non_speech_frames) / float(num_non_speech_frames+num_speech_frames))
    else:
        # If there are no frames, use a default (positive > 0.5) number.
        percentage_non_speech = NO_FRAMES_VALUE

    if verbose:
        print("percentage non-speech:", percentage_non_speech,
              "num_speech_frames", num_speech_frames,
              "num_non_speech_frames", num_non_speech_frames)

    return not is_empty_audio and percentage_non_speech < non_speech_threshold_fraction
129 changes: 49 additions & 80 deletions download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@
Modified by Hamzah Khan (khanh111) and added to the tarteel.io/Tarteel-ML on Jan. 12.
"""

import audio_preprocessing.recording_utils as recording_utils
import csv
import requests
import os
import webrtcvad
import requests
import wave
import webrtcvad

from urllib.parse import urlparse
from argparse import ArgumentParser

# Define argument constants.
TARTEEL_V1_CSV_URL = "https://www.tarteel.io/tarteel_v1.0.csv"
# TODO(khanh111) Remove once tarteel.io dataset download bug is fixed.
TARTEEL_V1_CSV_URL = "https://raw.githubusercontent.com/Tarteel-io/tarteel.io/master/audio/static/datasets/tarteel_v1.0.csv"
TARTEEL_LIVE_CSV_URL = "https://www.tarteel.io/download-full-dataset-csv"


parser = ArgumentParser(description='Tarteel Audio Downloader')
parser.add_argument('--csv_url', type=str, default=TARTEEL_V1_CSV_URL)
parser.add_argument('--local_csv_cache', type=str, default='.cache/local.csv')
Expand All @@ -26,16 +34,15 @@
parser.add_argument('--sample', type=bool, default=True)
args = parser.parse_args()

# Define constants.
TARTEEL_V1_CSV_URL = "https://www.tarteel.io/tarteel_v1.0.csv"
# TODO(khanh111) Remove once tarteel.io dataset download bug is fixed.
TARTEEL_V1_CSV_URL = "https://raw.githubusercontent.com/Tarteel-io/tarteel.io/master/audio/static/datasets/tarteel_v1.0.csv"
TARTEEL_LIVE_CSV_URL = "https://www.tarteel.io/download-full-dataset-csv"

# Define constants.
NO_FRAMES_VALUE = 1.0

DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5

def downloadCSVDataset():
WEBRTCVAD_SUPPORTED_SAMPLE_RATES_HZ = [8000, 16000, 32000, 48000]

def download_csv_dataset():
print("Downloading CSV from", args.csv_url)
with requests.Session() as s:
download = s.get(args.csv_url)
Expand All @@ -47,28 +54,32 @@ def downloadCSVDataset():
file.close()
print ("Done downloading CSV.")

def parse_csv():
    """
    Parses the locally cached dataset CSV.

    Reads the file at args.local_csv_cache and returns its rows as a list of lists of strings, one inner list per
    comma-delimited line.
    """
    # The context manager closes the cache file even if reading it fails.
    with open(args.local_csv_cache, "r") as file:
        content = file.read()

    cr = csv.reader(content.splitlines(), delimiter=',')
    return list(cr)

def cached_csv_exists():
    """
    Reports whether the path given by args.local_csv_cache names an existing regular file.
    """
    cache_path = args.local_csv_cache
    return os.path.isfile(cache_path)

def downloadAudio(row):
surah_number = row[0]
ayah_number = row[1]
def download_audio(row, local_download_dir):
"""
Downloads an audio recording given its entry in a Tarteel dataset csv.
"""
surah_num = int(row[0])
ayah_num = int(row[1])
url = "http://" + row[2]
parsed_url = urlparse(url)
wav_filename = os.path.basename(parsed_url.path)
local_download_path = os.path.join(args.local_download_dir, "s"+str(surah_number), "a"+str(ayah_number), wav_filename)
local_download_path = recording_utils.get_path_to_recording(local_download_dir, surah_num, ayah_num, wav_filename)

# Download and save the audio recording to the given path.
try:
with requests.Session() as s:
if args.verbose:
print("Downloading", url, "to", local_download_path)
print("Downloading", url, "to", local_download_path, ".")
download = s.get(url)
dirname = os.path.dirname(local_download_path)
if not os.path.exists(dirname):
Expand All @@ -77,80 +88,38 @@ def downloadAudio(row):
file.write(download.content)
file.close()

# Check for valid WAVE header.
try:
wf = wave.open(local_download_path, "rb")
wav_bytes = wf.readframes(wf.getnframes())
sample_rate = wf.getframerate()
num_channels = wf.getnchannels()
wf.close()

# The webrtc VAD engine only works on certain sample rates.
if args.vad_check and sample_rate in (8000, 16000, 320000, 48000) and not hasSpeech(wav_bytes, sample_rate, num_channels):
print("Audio file", local_download_path, "does not have speech according to VAD. Removing.")
os.remove(local_download_path)

except wave.Error:
print("Invalid wave header found", local_download_path, ", removing.")
os.remove(local_download_path)
except:
# move on if the download fails
pass

def hasSpeech(wav_bytes, sample_rate, num_channels):
# Use webrtc's VAD with the lowest level of aggressiveness.
mono_channel_bytes = wav_bytes

if num_channels == 2:
# just take the left channel for simplicity purposes.
# We're just trying to get a quick sanity check, no need
# to mix the two channels.
mono_channel_bytes = b"".join([wav_bytes[i:i+2] for i in range(0, len(wav_bytes), 4)])

vad = webrtcvad.Vad(1)
frame_duration = 10 # ms
bytes_per_sample = 2 # assuming 16-bit PCM.
samples_per_vaded_chunk = (sample_rate * frame_duration / 1000)
bytes_per_vaded_chunk = int(samples_per_vaded_chunk*bytes_per_sample)
num_speech_frames = 0
num_non_speech_frames = 0

for i in range(0, len(mono_channel_bytes)-bytes_per_vaded_chunk, bytes_per_vaded_chunk):
chunk_to_vad = mono_channel_bytes[i:i+bytes_per_vaded_chunk]
vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
if (webrtcvad.valid_rate_and_frame_length(sample_rate, vad_frame_length)
and vad.is_speech(chunk_to_vad, sample_rate)):
num_speech_frames += 1
else:
num_non_speech_frames += 1

has_frames = (num_speech_frames + num_non_speech_frames > 0)
emptyAudio = (num_speech_frames == 0 or (num_speech_frames and num_non_speech_frames == 0))

if has_frames:
percentage_non_speech = (float(num_non_speech_frames) / float(num_non_speech_frames+num_speech_frames))
else:
# If there are no frames, return a default (positive > 0.5) number.
percentage_non_speech = NO_FRAMES_VALUE

if args.verbose:
print ("percentage non-speech:", percentage_non_speech,
"num_speech_frames", num_speech_frames,
"num_non_speech_frames", num_non_speech_frames)
# If the download fails, print an error and exit the function.
print("Audio file", local_download_path, "could not be opened.")

# Check if the wave header is valid and if so, get the desired info.
has_valid_header, wav_bytes, sample_rate_hz, num_channels = recording_utils.open_recording(local_download_path)

if has_valid_header:
# Note: webrtcVAD does not currently support 44.1 kHz, so we have no way of checking those files for empty audio.
recording_lacks_speech = recording_utils.has_speech(
wav_bytes,
sample_rate_hz,
num_channels,
non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
verbose=args.verbose)

if args.vad_check and sample_rate_hz in WEBRTCVAD_SUPPORTED_SAMPLE_RATES_HZ and recording_lacks_speech:
print("Audio file", local_download_path, "does not have speech according to VAD. Removing.")
os.remove(local_download_path)

return not emptyAudio and percentage_non_speech < 0.5

if __name__ == "__main__":

if not cachedCSVExists():
downloadCSVDataset()
if not cached_csv_exists():
download_csv_dataset()
else:
print("Using cached copy of csv at", args.local_csv_cache)

rows = parseCSV()
rows = parse_csv()

for row in rows:
if row[0] == "1":
downloadAudio(row)
sample_rate_hz = download_audio(row, args.local_download_dir)
# else:
# downloadAudio(row)
# download_audio(row)

0 comments on commit 48727ce

Please sign in to comment.