Skip to content

Commit

Permalink
Added some scripts and functions to assist with collection false acti…
Browse files Browse the repository at this point in the history
…vations
  • Loading branch information
dscripka committed Jan 23, 2023
1 parent ac35230 commit 2edd4f5
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 2 deletions.
20 changes: 19 additions & 1 deletion examples/capture_activations.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,21 @@
default=0.5,
required=False
)
parser.add_argument(
    "--vad_threshold",
    help="""The threshold to use for voice activity detection (VAD) in the openWakeWord instance.
            The default (0.0), disables VAD.""",
    type=float,
    default=0.0,
    required=False
)


def _parse_bool_flag(value):
    """Convert a command-line string into a boolean.

    `type=bool` cannot be used for this: argparse would call `bool()` on the raw
    string, and every non-empty string (including "False" and "0") is truthy.

    Args:
        value (str): The raw text given on the command line

    Returns:
        bool: False for an empty string or a common "off" spelling (case-insensitive),
              True for anything else (which keeps every previously-working
              invocation such as `--noise_suppression True` enabled).
    """
    return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


parser.add_argument(
    "--noise_suppression",
    help="Whether to enable speex noise suppression in the openWakeWord instance.",
    type=_parse_bool_flag,
    default=False,
    required=False
)
args=parser.parse_args()

# Get microphone stream
Expand All @@ -62,7 +77,10 @@
mic_stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

# Load pre-trained openwakeword models
owwModel = Model()
# Forward the VAD threshold and noise suppression settings parsed from the
# command line to the openWakeWord model
model_kwargs = dict(
    vad_threshold=args.vad_threshold,
    enable_speex_noise_suppression=args.noise_suppression,
)
owwModel = Model(**model_kwargs)

# Set waiting period after activation before saving clip (to get some audio context after the activation)
save_delay = 1 # seconds
Expand Down
145 changes: 145 additions & 0 deletions examples/mine_false_positives.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Imports
import numpy as np
import os
import scipy.io.wavfile
import tempfile
import openwakeword
import argparse
import time
from speechbrain.dataio.dataio import read_audio
import collections
from tqdm import tqdm

# Parse input arguments
parser = argparse.ArgumentParser()

# Required arguments carry no `default`: argparse never uses a default for an
# argument declared with required=True, so one would only mislead readers.
parser.add_argument(
    "--input_files",
    help="""A text file where each line is a full path to an audio file to mine for false-positives.""",
    type=str,
    required=True
)
parser.add_argument(
    "--skip_files",
    help="""A text file where each line is a full path to an audio file that should be skipped.""",
    type=str,
    required=False
)
parser.add_argument(
    "--output_dir",
    help="""Where to save the audio features from a false-positive.
            By default, will be saved as <model_name>.npy files of shape N_clips x frames x features""",
    type=str,
    required=True
)
parser.add_argument(
    "--n_threads",
    help="""The number of CPU threads to use when processing.""",
    type=int,
    default=1,
    required=False
)
parser.add_argument(
    "--max_wall_time",
    help="""The total amount of wall-clock time (in hours) to mine for false-positives. When this limit is reached
            the examples found up to this point will be saved.""",
    type=float,
    default=1,
    required=False
)
parser.add_argument(
    "--max_feature_size",
    help="""The maximum size (in MB) for the false-positive features. If the total collected is larger
            than this, processing will stop.""",
    type=float,
    default=5000,
    required=False
)
args = parser.parse_args()

if __name__ == "__main__":
    # Get audio files to mine from input list (blank lines are not paths, so drop them)
    with open(args.input_files, 'r') as f:
        input_files = [line.strip() for line in f if line.strip()]

    # Get audio files to skip and adjust input file list
    if args.skip_files:
        with open(args.skip_files, 'r') as f:
            # A set keeps the membership test below O(1) per input file
            skip_files = {line.strip() for line in f}
        input_files = [path for path in input_files if path not in skip_files]

    # Set starting time
    start_time = time.time()

    # Begin processing files, two files per requested thread in each batch
    bs = int(args.n_threads*2)
    combined_features = collections.defaultdict(list)
    for batch_start in tqdm(range(0, len(input_files), bs)):
        with tempfile.TemporaryDirectory() as tmp_dir:
            batch = input_files[batch_start:batch_start+bs]
            tmp_file_paths = []
            for idx, audio_path in enumerate(batch):
                dat = read_audio(audio_path).numpy()
                if len(dat.shape) > 1:
                    dat = dat[:, 0]  # more than one dimension: keep only the first column

                # Scale the samples to 16-bit integers.
                # NOTE(review): no resampling happens here, yet the data is written out
                # below with a 16000 sample rate -- presumably the inputs are already
                # 16 khz; confirm against the data being mined.
                dat = (dat*32767).astype(np.int16)

                # Save audio to temporary .wav files. The position in the batch is part
                # of the name so that inputs sharing a basename (from different
                # directories) do not overwrite each other.
                tmp_fname = os.path.join(tmp_dir, f"{idx}_{os.path.basename(audio_path)}")
                scipy.io.wavfile.write(tmp_fname, 16000, dat)
                tmp_file_paths.append(tmp_fname)

            # Predict on temporary files
            predictions = openwakeword.utils.bulk_predict(
                file_paths=tmp_file_paths,
                wakeword_model_paths=[],  # loads all default models
                prediction_function="_get_positive_prediction_frames",
                ncpu=args.n_threads
            )

            # Combine and store features, grouped by label across all files
            for file_predictions in predictions.values():
                for lbl, features in file_predictions.items():
                    combined_features[lbl].append(features)

            # Check for maximum processing time (max_wall_time is given in hours)
            if (time.time() - start_time)/3600 > args.max_wall_time:
                print("\nMaximum wall-time reached. Saving mined false-positives and exiting...")
                break

            # Check for maximum features size in memory (in MB)
            size = sum(
                features.nbytes
                for feature_list in combined_features.values()
                for features in feature_list
            )/1e6
            if size > args.max_feature_size:
                print("\nMaximum feature size (in MB) reached. Saving mined false-positives and exiting...")
                break

    # Combine mined features into single numpy arrays
    stacked_features = {
        lbl: np.concatenate(feature_list, axis=0)
        for lbl, feature_list in combined_features.items()
    }

    # Save results to .npy files (creating the output directory, and any missing parents)
    os.makedirs(args.output_dir, exist_ok=True)

    for lbl, features in stacked_features.items():
        np.save(os.path.join(args.output_dir, f"{lbl}.npy"), features)

34 changes: 34 additions & 0 deletions openwakeword/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,40 @@ def estimate_clip_duration(audio_files: list, sizes: list):
return durations


def estimate_mp3_duration(fpath):
    """Estimates the duration of an MP3 file from its metadata and size on disk.

    The estimate multiplies the file size in bytes by a fixed per-byte factor, so
    it is only meaningful for 16 khz audio with a relatively constant bit-rate.

    Args:
        fpath (str): The input path to the MP3 file

    Returns:
        float: The estimated duration of the MP3 file in seconds. The value is 0
               when the metadata cannot be read (`torchaudio.info` raises a
               RuntimeError), when the sample rate is not 16000, or when the file
               has neither one nor two channels.
    """
    # Seconds of audio represented by each byte of the file
    seconds_per_byte = {
        "16_khz_single_channel": 0.000333318208471784,
        "16_khz_stereo": 0.000333318208471784/2
    }
    factor_name_by_channels = {1: "16_khz_single_channel", 2: "16_khz_stereo"}

    try:
        info = torchaudio.info(fpath)
    except RuntimeError:
        return 0

    file_size = os.path.getsize(fpath)

    # Only mono and stereo files at 16000 samples per second have a known factor
    factor_name = factor_name_by_channels.get(info.num_channels)
    if factor_name is None or info.sample_rate != 16000:
        return 0

    return file_size*seconds_per_byte[factor_name]


def get_clip_duration(clip):
"""Gets the duration of an audio clip in seconds from file header information"""
try:
Expand Down
38 changes: 38 additions & 0 deletions openwakeword/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,44 @@ def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, **kwargs)

return predictions

def _get_positive_prediction_frames(self, file: str, threshold: float = 0.5, **kwargs):
    """
    Gets predictions for the input audio data, and returns the audio features (embeddings)
    for all of the frames with a score at or above the `threshold` argument. Can be a useful
    way to collect false-positive predictions.

    The clip is passed to the class `predict` method in consecutive chunks of 1280 samples.
    Samples left over after the last full chunk are not processed.

    Args:
        file (str): The path to a 16-bit 16khz WAV audio file to process
        threshold (float): The minimum score required for a frame of audio features
                           to be returned.
        kwargs: Any keyword arguments to pass to the class `predict` method

    Returns:
        dict: A dictionary with prediction labels as keys and arrays as values. Each array
              is the `np.vstack` of the features collected for that label, one entry per
              chunk that scored at or above the threshold. Labels that never reach the
              threshold are absent.
    """
    # Load audio clip as 16-bit PCM data
    with wave.open(file, mode='rb') as f:
        # Load WAV clip frames
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

    # Iterate through clip, getting predictions
    positive_features = defaultdict(list)
    step_size = 1280
    # The "+ 1" on the stop value keeps the final chunk when the clip length is an
    # exact multiple of step_size (a stop of `len - step_size` would exclude it).
    for start in range(0, data.shape[0] - step_size + 1, step_size):
        predictions = self.predict(data[start:start+step_size], **kwargs)
        for lbl, score in predictions.items():
            if score >= threshold:
                # Fetch the features for the model that owns this label
                # NOTE(review): presumably these are the features the model just
                # scored -- confirm against `preprocessor.get_features`
                mdl = self.get_parent_model_from_label(lbl)
                features = self.preprocessor.get_features(self.model_inputs[mdl])
                positive_features[lbl].append(features)

    # Collapse each label's list of features into a single array
    for lbl in list(positive_features):
        positive_features[lbl] = np.vstack(positive_features[lbl])

    return positive_features

def _suppress_noise_with_speex(self, x: np.ndarray, frame_size: int = 160):
"""
Runs the input audio through the SpeexDSP noise suppression algorithm.
Expand Down
5 changes: 4 additions & 1 deletion openwakeword/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ def __call__(self, x):
def bulk_predict(
file_paths: List[str],
wakeword_model_paths: List[str],
prediction_function: str = 'predict_clip',
ncpu: int = 1,
**kwargs
):
Expand All @@ -324,6 +325,8 @@ def bulk_predict(
Args:
file_paths (List[str]): The list of input files to predict on
wakeword_model_paths (List[str]): The paths to the wakeword ONNX model files
prediction_function (str): The name of the model method used to predict on the input audio files
(default is the `predict_clip` method)
ncpu (int): How many processes to create (up to max of available CPUs)
kwargs (dict): Any other keyword arguments to pass to the method named by `prediction_function`
Expand Down Expand Up @@ -351,7 +354,7 @@ def bulk_predict(
def f(clips):
results = []
for clip in clips:
results.append({clip: mdls[-1].predict_clip(clip, **kwargs)})
results.append({clip: getattr(mdls[-1], prediction_function)(clip, **kwargs)})
q.put(results)

ps.append(Process(target=f, args=(chunk,)))
Expand Down

0 comments on commit 2edd4f5

Please sign in to comment.