Added some scripts and functions to assist with collecting false acti…

…vations
AnkushMalaker · Jan 23, 2023 · 2edd4f5 · 2edd4f5
1 parent ac35230
commit 2edd4f5
Show file tree

Hide file tree

Showing 5 changed files with 240 additions and 2 deletions.
diff --git a/examples/capture_activations.py b/examples/capture_activations.py
@@ -51,6 +51,21 @@
     default=0.5,
     required=False
 )
+parser.add_argument(
+    "--vad_threshold",
+    help="""The threshold to use for voice activity detection (VAD) in the openWakeWord instance.
+            The default (0.0), disables VAD.""",
+    type=float,
+    default=0.0,
+    required=False
+)
+parser.add_argument(
+    "--noise_suppression",
+    help="Whether to enable speex noise suppression in the openWakeWord instance.",
+    type=lambda s: s.strip().lower() in ("1", "true", "yes"),  # type=bool would parse "False" as True
+    default=False,
+    required=False
+)
 args=parser.parse_args()
 
 # Get microphone stream
@@ -62,7 +77,10 @@
 mic_stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
 
 # Load pre-trained openwakeword models
-owwModel = Model()
+owwModel = Model(
+    vad_threshold=args.vad_threshold,
+    enable_speex_noise_suppression=args.noise_suppression,
+)
 
 # Set waiting period after activation before saving clip (to get some audio context after the activation)
 save_delay = 1  # seconds

diff --git a/examples/mine_false_positives.py b/examples/mine_false_positives.py
@@ -0,0 +1,145 @@
+# Copyright 2022 David Scripka. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Imports
+import numpy as np
+import os
+import scipy.io.wavfile
+import tempfile
+import openwakeword
+import argparse
+import time
+from speechbrain.dataio.dataio import read_audio
+import collections
+from tqdm import tqdm
+
+# Parse input arguments
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--input_files",
+    help="""A text file where each line is a full path to an audio file to mine for false-positives.""",
+    type=str,
+    # No default: the argument is required, so a default value could never take effect
+    required=True
+)
+parser.add_argument(
+    "--skip_files",
+    help="""A text file where each line is a full path to an audio file that should be skipped.""",
+    type=str,
+    required=False
+)
+parser.add_argument(
+    "--output_dir",
+    help="""Where to save the audio features from a false-positive.
+          By default, will be saved as <model_name>.npy files of shape N_clips x frames x features""",
+    type=str,
+    # No default: the argument is required, so a default value could never take effect
+    required=True
+)
+parser.add_argument(
+    "--n_threads",
+    help="""The number of CPU threads to use when processing.""",
+    type=int,
+    default=1,
+    required=False
+)
+parser.add_argument(
+    "--max_wall_time",
+    help="""The total amount of wall-clock time (in hours) to mine for false-positives. When this limit is reached
+            the examples found up to this point will be saved.""",
+    type=float,
+    default=1,
+    required=False
+)
+parser.add_argument(
+    "--max_feature_size",
+    help="""The maximum size (in MB) for the false-positive features. If the total collected is larger
+            than this, processing will stop.""",
+    type=float,
+    default=5000,
+    required=False
+)
+args = parser.parse_args()
+
+if __name__ == "__main__":
+    # Get audio files to mine from input list (blank lines are ignored)
+    with open(args.input_files, 'r') as f:
+        input_files = [ln.strip() for ln in f if ln.strip()]
+
+    # Get audio files to skip and adjust input file list
+    if args.skip_files:
+        with open(args.skip_files, 'r') as f:
+            # A set keeps the membership test below O(1) per file
+            skip_files = {ln.strip() for ln in f}
+        input_files = [fl for fl in input_files if fl not in skip_files]
+
+    # Set starting time
+    start_time = time.time()
+
+    # Begin processing files, in batches of two files per worker process
+    bs = int(args.n_threads*2)
+    combined_features = collections.defaultdict(list)
+    for start in tqdm(range(0, len(input_files), bs)):
+        # Each batch gets its own temporary directory, which is removed when the batch is done
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_file_paths = []
+            for n, fpath in enumerate(input_files[start:start+bs]):
+                dat = read_audio(fpath).numpy()
+                if len(dat.shape) > 1:
+                    dat = dat[:, 0]  # multi-channel audio: keep only the first channel
+
+                # Scale to 16-bit PCM. No resampling happens here: the audio is assumed
+                # to already be 16 khz, which is the rate written to the .wav header below.
+                dat = (dat*32767).astype(np.int16)
+
+                # Save audio to temporary .wav files. The index prefix keeps input files
+                # that share a base name from overwriting each other within a batch.
+                tmp_fname = os.path.join(tmp_dir, f"{n}_{os.path.basename(fpath)}")
+                scipy.io.wavfile.write(tmp_fname, 16000, dat)
+                tmp_file_paths.append(tmp_fname)
+
+            # Predict on temporary files
+            predictions = openwakeword.utils.bulk_predict(
+                file_paths=tmp_file_paths,
+                wakeword_model_paths=[], # loads all default models
+                prediction_function="_get_positive_prediction_frames",
+                ncpu=args.n_threads
+            )
+
+            # Combine and store features
+            for file_predictions in predictions.values():
+                for lbl, features in file_predictions.items():
+                    combined_features[lbl].append(features)
+
+            # Check for maximum processing time
+            if (time.time() - start_time)/3600 > args.max_wall_time:
+                print("\nMaximum wall-time reached. Saving mined false-positives and exiting...")
+                break
+
+            # Check for maximum features size in memory (in MB)
+            size = sum(arr.nbytes for arrs in combined_features.values() for arr in arrs)/1e6
+            if size > args.max_feature_size:
+                print("\nMaximum feature size (in MB) reached. Saving mined false-positives and exiting...")
+                break
+
+    # Combine mined features into single numpy arrays (one per label)
+    stacked_features = {
+        lbl: np.concatenate(features, axis=0) for lbl, features in combined_features.items()
+    }
+
+    # Save results to .npy files, creating the output directory (and any parents) if needed
+    os.makedirs(args.output_dir, exist_ok=True)
+    for lbl, features in stacked_features.items():
+        np.save(os.path.join(args.output_dir, f"{lbl}.npy"), features)
+
diff --git a/openwakeword/data.py b/openwakeword/data.py
@@ -225,6 +225,40 @@ def estimate_clip_duration(audio_files: list, sizes: list):
     return durations
 
 
+def estimate_mp3_duration(fpath):
+    """Approximate the length of an MP3 file from its size on disk.
+
+    The file size in bytes is multiplied by a fixed seconds-per-byte factor
+    chosen by channel count, so the result is only meaningful for 16 khz
+    audio encoded at a relatively constant bit-rate.
+
+    Args:
+        fpath (str): The input path to the MP3 file
+
+    Returns:
+        float: The estimated duration in seconds, or 0 when `torchaudio.info`
+               raises a RuntimeError, the sample rate is not 16000, or the
+               file has neither one nor two channels
+    """
+
+    # Seconds of audio per byte of file, keyed by number of channels
+    seconds_per_byte = {
+        1: 0.000333318208471784,
+        2: 0.000333318208471784/2
+    }
+
+    try:
+        md = torchaudio.info(fpath)
+    except RuntimeError:
+        return 0
+
+    nbytes = os.path.getsize(fpath)
+    if md.sample_rate != 16000 or md.num_channels not in seconds_per_byte:
+        return 0
+
+    return nbytes*seconds_per_byte[md.num_channels]
+
+
 def get_clip_duration(clip):
     """Gets the duration of an audio clip in seconds from file header information"""
     try:

diff --git a/openwakeword/model.py b/openwakeword/model.py
@@ -266,6 +266,44 @@ def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, **kwargs)
 
         return predictions
 
+    def _get_positive_prediction_frames(self, file: str, threshold: float = 0.5, **kwargs):
+        """
+        Gets predictions for the input audio data, and returns the audio features (embeddings)
+        for all of the frames with a score at or above the `threshold` argument. Can be a useful
+        way to collect false-positive predictions.
+
+        Args:
+            file (str): The path to a 16-bit 16khz WAV audio file to process
+            threshold (float): The minimum score required for a frame of audio features
+                               to be returned.
+            kwargs: Any keyword arguments to pass to the class `predict` method
+
+        Returns:
+            dict: A dictionary with prediction labels as keys and, as values, the features of
+                  every frame that met the threshold, stacked with `np.vstack`. Labels that
+                  never reached the threshold are absent from the dictionary.
+        """
+        # Load audio clip as 16-bit PCM data
+        with wave.open(file, mode='rb') as f:
+            data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
+
+        # Iterate through clip in 1280 sample chunks, getting predictions. The `+ 1` keeps the
+        # last full chunk when the clip length is an exact multiple of the chunk size.
+        positive_features = defaultdict(list)
+        step_size = 1280
+        for i in range(0, data.shape[0] - step_size + 1, step_size):
+            predictions = self.predict(data[i:i+step_size], **kwargs)
+            for lbl, score in predictions.items():
+                if score >= threshold:
+                    mdl = self.get_parent_model_from_label(lbl)
+                    features = self.preprocessor.get_features(self.model_inputs[mdl])
+                    positive_features[lbl].append(features)
+
+        for lbl in positive_features.keys():
+            positive_features[lbl] = np.vstack(positive_features[lbl])
+
+        return positive_features
+
     def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
         """
         Runs the input audio through the SpeexDSP noise suppression algorithm.

diff --git a/openwakeword/utils.py b/openwakeword/utils.py
@@ -315,6 +315,7 @@ def __call__(self, x):
 def bulk_predict(
                  file_paths: List[str],
                  wakeword_model_paths: List[str],
+                 prediction_function: str = 'predict_clip',
                  ncpu: int = 1,
                  **kwargs
                  ):
@@ -324,6 +325,8 @@ def bulk_predict(
     Args:
         input_paths (List[str]): The list of input file to predict
         wakeword_model_path (List[str])): The paths to the wakeword ONNX model files
+        prediction_function (str): The name of the method used to predict on the input audio files
+                                   (default is the `predict_clip` method)
         ncpu (int): How many processes to create (up to max of available CPUs)
         kwargs (dict): Any other keyword arguments to pass to the model prediction function (`predict_clip`)
 
@@ -351,7 +354,7 @@ def bulk_predict(
         def f(clips):
             results = []
             for clip in clips:
-                results.append({clip: mdls[-1].predict_clip(clip, **kwargs)})
+                results.append({clip: getattr(mdls[-1], prediction_function)(clip, **kwargs)})
             q.put(results)
 
         ps.append(Process(target=f, args=(chunk,)))