Add a utils file for recordings and refactor to use it.

TarteelAI · Jan 14, 2019 · 48727ce · 48727ce
1 parent a95035e
commit 48727ce
Show file tree

Hide file tree

Showing 2 changed files with 209 additions and 80 deletions.
diff --git a/audio_preprocessing/recording_utils.py b/audio_preprocessing/recording_utils.py
@@ -0,0 +1,160 @@
+"""
+A file containing utils for dealing with Tarteel audio recordings.
+
+Author: Hamzah Khan
+Date: Jan. 13, 2019
+"""
+
+import numpy as np
+import os
+import wave
+import webrtcvad
+
+DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5
+ALL_SURAHS = None
+NUM_SURAHS = 114
+
+def get_paths_to_all_recordings(local_download_dir):
+    """
+    Collects the paths of every recording found under the given download directory.
+
+    Delegates to get_paths_to_surah_recordings with its default surah selection.
+    """
+    all_recording_paths = get_paths_to_surah_recordings(local_download_dir)
+    return all_recording_paths
+
+def get_paths_to_surah_recordings(local_download_dir, surahs=ALL_SURAHS):
+    """
+    Returns a list of paths, in the given directory, to recordings of ayahs in the specified surahs.
+
+    Recordings are looked up under <local_download_dir>/s<surah_num>/<ayah directory>/.
+
+    Args:
+        local_download_dir: Root directory containing the per-surah "s<N>" directories.
+        surahs: Iterable of surah numbers to include. None or an empty iterable selects surahs 1..NUM_SURAHS.
+
+    Returns:
+        A list with one path per recording file found. Surah directories that do not exist contribute nothing.
+    """
+    # Materialize the selection so emptiness can be tested safely: the truth value of a multi-element numpy array
+    # is ambiguous and raises ValueError under a plain `not surahs` check.
+    selected_surahs = [] if surahs is None else list(surahs)
+    if not selected_surahs:
+        selected_surahs = range(1, NUM_SURAHS + 1)
+
+    paths_to_audio = []
+
+    for surah_num in selected_surahs:
+        local_surah_dir = os.path.join(local_download_dir, "s" + str(surah_num))
+
+        # os.walk already recurses, so a single walk visits every ayah directory. Joining with the directory
+        # reported by the walk keeps paths correct even for files nested deeper than one level.
+        for current_dir, _, recording_filenames in os.walk(local_surah_dir):
+            if current_dir == local_surah_dir:
+                # Files sitting directly in the surah directory are not inside an ayah directory; skip them.
+                continue
+
+            for recording_filename in recording_filenames:
+                # Add the fully constructed path name to the list of paths.
+                paths_to_audio.append(os.path.join(current_dir, recording_filename))
+
+    return paths_to_audio
+
+def get_paths_to_ayah_recordings(local_download_dir, ayahs):
+    """
+    Returns a list of paths, in the given directory, to recordings of the specified ayahs.
+
+    Args:
+        local_download_dir: Root directory containing the "s<N>/a<M>" recording directories.
+        ayahs: Non-empty collection of (surah_num, ayah_num) pairs.
+
+    Returns:
+        A list with one path per recording file found under each requested ayah directory.
+
+    Raises:
+        ValueError: If ayahs is None or empty.
+    """
+    if not ayahs:
+        # ValueError subclasses Exception, so callers catching the previous bare Exception still work.
+        raise ValueError('Invalid list of ayahs - should contain a tuples of surah-ayah pairs.')
+
+    paths_to_audio = []
+
+    for surah_num, ayah_num in ayahs:
+        local_ayah_dir = os.path.join(local_download_dir, "s" + str(surah_num), "a" + str(ayah_num))
+
+        # Join with the directory reported by os.walk so that files in nested subdirectories get their real path
+        # instead of being attributed to the top-level ayah directory.
+        for current_dir, _, recording_filenames in os.walk(local_ayah_dir):
+            for recording_filename in recording_filenames:
+                # Add the fully constructed path name to the list of paths.
+                paths_to_audio.append(os.path.join(current_dir, recording_filename))
+
+    return paths_to_audio
+
+def get_path_to_recording(local_download_dir, surah_num, ayah_num, filename):
+    """
+    Builds the path <local_download_dir>/s<surah_num>/a<ayah_num>/<filename> of a single recording.
+    """
+    surah_dir_name = "s" + str(surah_num)
+    ayah_dir_name = "a" + str(ayah_num)
+    return os.path.join(local_download_dir, surah_dir_name, ayah_dir_name, filename)
+
+def get_path_to_recording_by_id(local_download_dir, surah_num, ayah_num, recording_id, file_extension='wav'):
+    """
+    Builds the path of a single recording whose filename follows the "<surah>_<ayah>_<recording id>.<extension>"
+    pattern, placed under the surah/ayah directory layout of local_download_dir.
+    """
+    name_parts = (surah_num, ayah_num, recording_id, file_extension)
+    return get_path_to_recording(local_download_dir, surah_num, ayah_num, "%d_%d_%d.%s" % name_parts)
+
+def open_recording(path_to_audio):
+    """
+    Reads the wave file at the given path.
+
+    Returns a flat 4-tuple (True, wav_bytes, sample_rate_hz, num_channels) if the wave library can read the file,
+    where wav_bytes holds all audio frames. Returns (False, None, None, None) if it cannot; in that case the file
+    is deleted from disk.
+
+    Note: As of now, a proper header is defined as whether the wave library can open the file.
+    """
+    try:
+        # The context manager closes the file even if reading the frames fails part-way.
+        with wave.open(path_to_audio, "rb") as wf:
+            wav_bytes = wf.readframes(wf.getnframes())
+            sample_rate_hz = wf.getframerate()
+            num_channels = wf.getnchannels()
+
+    # An empty or truncated file can surface as EOFError rather than wave.Error; treat both as an invalid file.
+    except (wave.Error, EOFError):
+        print("Invalid wave header found", path_to_audio, ", removing.")
+        os.remove(path_to_audio)
+        return (False, None, None, None)
+
+    return (True, wav_bytes, sample_rate_hz, num_channels)
+
+def has_speech(wav_bytes,
+               sample_rate_hz,
+               num_channels,
+               non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
+               verbose=False):
+    """
+    Returns True if the fraction of frames without voice activity is below non_speech_threshold_fraction.
+
+    Recordings in which no frame is classified as speech, or in which every frame is, are reported as not having
+    speech.
+
+    Args:
+        wav_bytes: Raw audio frames; assumed to be 16-bit PCM.
+        sample_rate_hz: Sample rate of the audio in Hz.
+        num_channels: Number of channels; for 2 channels only the left one is analyzed.
+        non_speech_threshold_fraction: Exclusive upper bound on the fraction of non-speech frames.
+        verbose: If True, prints the frame statistics.
+
+    Note: webrtc VAD does not currently support 44.1 kHz, so we have no way of checking those files for empty
+    audio. Frames that webrtcvad rejects as an invalid rate/length combination are counted as non-speech.
+    """
+    # Reported as the non-speech fraction when there are no frames to analyze (a positive number > 0.5).
+    # Defined locally because this module has no NO_FRAMES_VALUE constant; referencing it raised NameError.
+    no_frames_value = 1.0
+
+    mono_channel_bytes = wav_bytes
+
+    if num_channels == 2:
+        # Just take the left channel for simplicity purposes. We're just trying to get a quick sanity check, no
+        # need to mix the two channels.
+        mono_channel_bytes = b"".join([wav_bytes[i:i+2] for i in range(0, len(wav_bytes), 4)])
+
+    # Use webrtc's VAD in aggressiveness mode 1.
+    vad = webrtcvad.Vad(1)
+    frame_duration = 10  # ms
+    bytes_per_sample = 2  # assuming 16-bit PCM.
+    samples_per_vaded_chunk = (sample_rate_hz * frame_duration / 1000)
+    bytes_per_vaded_chunk = int(samples_per_vaded_chunk*bytes_per_sample)
+    num_speech_frames = 0
+    num_non_speech_frames = 0
+
+    # NOTE(review): the exclusive upper bound skips the final chunk even when it is complete - confirm whether
+    # that is intended.
+    for i in range(0, len(mono_channel_bytes)-bytes_per_vaded_chunk, bytes_per_vaded_chunk):
+        chunk_to_vad = mono_channel_bytes[i:i+bytes_per_vaded_chunk]
+        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
+        if (webrtcvad.valid_rate_and_frame_length(sample_rate_hz, vad_frame_length)
+            and vad.is_speech(chunk_to_vad, sample_rate_hz)):
+            num_speech_frames += 1
+        else:
+            num_non_speech_frames += 1
+
+    has_frames = (num_speech_frames + num_non_speech_frames > 0)
+    # NOTE(review): a recording made up entirely of speech frames is also treated as empty - confirm intent.
+    empty_audio = (num_speech_frames == 0 or (num_speech_frames and num_non_speech_frames == 0))
+
+    if has_frames:
+        percentage_non_speech = (float(num_non_speech_frames) / float(num_non_speech_frames+num_speech_frames))
+    else:
+        # If there are no frames, fall back to the default number.
+        percentage_non_speech = no_frames_value
+
+    if verbose:
+        print ("percentage non-speech:", percentage_non_speech,
+               "num_speech_frames", num_speech_frames,
+               "num_non_speech_frames", num_non_speech_frames)
+
+    return not empty_audio and percentage_non_speech < non_speech_threshold_fraction
diff --git a/download.py b/download.py
@@ -8,15 +8,23 @@
 Modified by Hamzah Khan (khanh111) and added to the tarteel.io/Tarteel-ML on Jan. 12.
 """
 
+import audio_preprocessing.recording_utils as recording_utils
 import csv
-import requests
 import os
-import webrtcvad
+import requests
 import wave
+import webrtcvad
 
 from urllib.parse import urlparse
 from argparse import ArgumentParser
 
+# Define argument constants.
+TARTEEL_V1_CSV_URL = "https://www.tarteel.io/tarteel_v1.0.csv"
+# TODO(khanh111) Remove once tarteel.io dataset download bug is fixed.
+TARTEEL_V1_CSV_URL = "https://raw.githubusercontent.com/Tarteel-io/tarteel.io/master/audio/static/datasets/tarteel_v1.0.csv"
+TARTEEL_LIVE_CSV_URL = "https://www.tarteel.io/download-full-dataset-csv"
+
+
 parser = ArgumentParser(description='Tarteel Audio Downloader')
 parser.add_argument('--csv_url', type=str, default=TARTEEL_V1_CSV_URL)
 parser.add_argument('--local_csv_cache', type=str, default='.cache/local.csv')
@@ -26,16 +34,15 @@
 parser.add_argument('--sample', type=bool, default=True)
 args = parser.parse_args()
 
-# Define constants.
-TARTEEL_V1_CSV_URL = "https://www.tarteel.io/tarteel_v1.0.csv"
-# TODO(khanh111) Remove once tarteel.io dataset download bug is fixed.
-TARTEEL_V1_CSV_URL = "https://raw.githubusercontent.com/Tarteel-io/tarteel.io/master/audio/static/datasets/tarteel_v1.0.csv"
-TARTEEL_LIVE_CSV_URL = "https://www.tarteel.io/download-full-dataset-csv"
 
+# Define constants.
 NO_FRAMES_VALUE = 1.0
 
+DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5
 
-def downloadCSVDataset():
+WEBRTCVAD_SUPPORTED_SAMPLE_RATES_HZ = [8000, 16000, 32000, 48000]
+
+def download_csv_dataset():
     print("Downloading CSV from", args.csv_url)
     with requests.Session() as s:
         download = s.get(args.csv_url)
@@ -47,28 +54,32 @@ def downloadCSVDataset():
         file.close()
         print ("Done downloading CSV.")
 
-def parseCSV():
+def parse_csv():
+    """
+    Reads the cached CSV at args.local_csv_cache and returns its rows as a list of lists of strings.
+    """
+    # NOTE(review): the file handle opened below is never closed explicitly.
     file = open(args.local_csv_cache, "r")
     content = file.read()
     cr = csv.reader(content.splitlines(), delimiter=',')
     rows = list(cr)
     return rows
 
-def cachedCSVExists():
+def cached_csv_exists():
+    """
+    Returns True if a file exists at args.local_csv_cache.
+    """
     return os.path.isfile(args.local_csv_cache)
 
-def downloadAudio(row):
-    surah_number = row[0]
-    ayah_number = row[1]
+def download_audio(row, local_download_dir):
+    """
+    Downloads an audio recording given its entry in a Tarteel dataset csv.
+    """
+    surah_num = int(row[0])
+    ayah_num = int(row[1])
     url = "http://" + row[2]
     parsed_url = urlparse(url)
     wav_filename = os.path.basename(parsed_url.path)
-    local_download_path = os.path.join(args.local_download_dir, "s"+str(surah_number), "a"+str(ayah_number), wav_filename)
+    local_download_path = recording_utils.get_path_to_recording(local_download_dir, surah_num, ayah_num, wav_filename)
 
+    # Download and save the audio recording to the given path.
     try:
         with requests.Session() as s:
             if args.verbose:
-                print("Downloading", url, "to", local_download_path)
+                print("Downloading", url, "to", local_download_path, ".")
             download = s.get(url)
             dirname = os.path.dirname(local_download_path)
             if not os.path.exists(dirname):
@@ -77,80 +88,38 @@ def downloadAudio(row):
             file.write(download.content)
             file.close()
 
-        # Check for valid WAVE header.
-        try:
-            wf = wave.open(local_download_path, "rb")
-            wav_bytes = wf.readframes(wf.getnframes())
-            sample_rate = wf.getframerate()
-            num_channels = wf.getnchannels()
-            wf.close()
-
-            # The webrtc VAD engine only works on certain sample rates.
-            if args.vad_check and sample_rate in (8000, 16000, 320000, 48000) and not hasSpeech(wav_bytes, sample_rate, num_channels):
-                print("Audio file", local_download_path, "does not have speech according to VAD. Removing.")
-                os.remove(local_download_path)
-
-        except wave.Error:
-            print("Invalid wave header found", local_download_path, ", removing.")
-            os.remove(local_download_path)
     except:
-        # move on if the download fails
-        pass
-
-def hasSpeech(wav_bytes, sample_rate, num_channels):
-    # Use webrtc's VAD with the lowest level of aggressiveness.
-    mono_channel_bytes = wav_bytes
-
-    if num_channels == 2:
-        # just take the left channel for simplicity purposes.
-        # We're just trying to get a quick sanity check, no need
-        # to mix the two channels.
-        mono_channel_bytes = b"".join([wav_bytes[i:i+2] for i in range(0, len(wav_bytes), 4)])
-
-    vad = webrtcvad.Vad(1)
-    frame_duration = 10  # ms
-    bytes_per_sample = 2 # assuming 16-bit PCM.
-    samples_per_vaded_chunk = (sample_rate * frame_duration / 1000)
-    bytes_per_vaded_chunk = int(samples_per_vaded_chunk*bytes_per_sample)
-    num_speech_frames = 0
-    num_non_speech_frames = 0
-
-    for i in range(0, len(mono_channel_bytes)-bytes_per_vaded_chunk, bytes_per_vaded_chunk):
-        chunk_to_vad = mono_channel_bytes[i:i+bytes_per_vaded_chunk]
-        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
-        if (webrtcvad.valid_rate_and_frame_length(sample_rate, vad_frame_length)
-            and vad.is_speech(chunk_to_vad, sample_rate)):
-            num_speech_frames += 1
-        else:
-            num_non_speech_frames += 1
-
-    has_frames = (num_speech_frames + num_non_speech_frames > 0)
-    emptyAudio = (num_speech_frames == 0 or (num_speech_frames and num_non_speech_frames == 0))
-
-    if has_frames:
-        percentage_non_speech = (float(num_non_speech_frames) / float(num_non_speech_frames+num_speech_frames))
-    else:
-        # If there are no frames, return a default (positive > 0.5) number.
-        percentage_non_speech = NO_FRAMES_VALUE
-
-    if args.verbose:
-        print ("percentage non-speech:", percentage_non_speech,
-               "num_speech_frames", num_speech_frames,
-               "num_non_speech_frames", num_non_speech_frames)
+        # If the download fails, print an error and exit the function.
+        print("Audio file", local_download_path, "could not be opened.")
+
+    if not os.path.isfile(local_download_path):
+        # Nothing was saved (e.g. the download failed), so there is no file to validate.
+        return
+
+    # Check if the wave header is valid and if so, get the desired info.
+    has_valid_header, wav_bytes, sample_rate_hz, num_channels = recording_utils.open_recording(local_download_path)
+
+    # Only run the VAD when it was requested and the sample rate is one webrtcvad supports.
+    # Note: webrtcvad does not currently support 44.1 kHz, so we have no way of checking those files for empty audio.
+    if has_valid_header and args.vad_check and sample_rate_hz in WEBRTCVAD_SUPPORTED_SAMPLE_RATES_HZ:
+        recording_has_speech = recording_utils.has_speech(
+            wav_bytes,
+            sample_rate_hz,
+            num_channels,
+            non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
+            verbose=args.verbose)
+
+        # has_speech returns True when speech was detected, so remove the file only when it returns False.
+        if not recording_has_speech:
+            print("Audio file", local_download_path, "does not have speech according to VAD. Removing.")
+            os.remove(local_download_path)
 
-    return not emptyAudio and percentage_non_speech < 0.5
 
 if __name__ == "__main__":
 
-    if not cachedCSVExists():
-        downloadCSVDataset()
+    if not cached_csv_exists():
+        download_csv_dataset()
     else:
         print("Using cached copy of csv at", args.local_csv_cache)
 
-    rows = parseCSV()
+    rows = parse_csv()
 
     for row in rows:
+        # Only rows whose first column (the surah number) is "1" are downloaded.
         if row[0] == "1":
-            downloadAudio(row)
+            download_audio(row, args.local_download_dir)
         # else:
-        #     downloadAudio(row)
+        #     download_audio(row, args.local_download_dir)