# Configuration file to be used with `train.py` to create custom wake word/phrase models
# The name of the model (will be used when creating directories and when saving the final .onnx and .tflite files)
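# A typical invocation looks like the following (the flag names are assumptions based on the
# upstream openWakeWord training script, not defined in this file; verify them against your
# copy of train.py):
#   python train.py --training_config custom_model.yml --generate_clips
#   python train.py --training_config custom_model.yml --augment_clips
#   python train.py --training_config custom_model.yml --train_model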
model_name: "my_model"
# The target word/phrase to be detected by the model. Adding multiple unique words/phrases will
# still train only a single binary detection model, but it will activate on any one of the provided words/phrases.
target_phrase:
- "hey jarvis"
# Specific phrases that you do *not* want the model to activate on, outside of those generated automatically via phoneme overlap
# This can be a good way to reduce false positives if you notice that, in practice, certain words or phrases are problematic
custom_negative_phrases: []
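# For example (hypothetical phrases, shown only to illustrate the list format):
# custom_negative_phrases:
#   - "hey jarvas"
#   - "hey travis"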
# The total number of positive samples to generate for training (minimum of 20,000 recommended, often 100,000+ is best)
n_samples: 10000
# The total number of positive samples to generate for validation and early stopping of model training
n_samples_val: 2000
# The batch size to use with Piper TTS when generating synthetic training data
tts_batch_size: 50
# The batch size to use when performing data augmentation on generated clips prior to training
# It's recommended that this not be too large to ensure that there is enough variety in the augmentation
augmentation_batch_size: 16
# The path to a fork of the piper-sample-generator repository for TTS (https://github.com/dscripka/piper-sample-generator)
piper_sample_generator_path: "./piper-sample-generator"
# The output directory for the generated synthetic clips, openwakeword features, and trained models
# Sub-directories will be automatically created for train and test clips for both positive and negative examples
output_dir: "./my_custom_model"
# The directories containing Room Impulse Response recordings
rir_paths:
- "./mit_rirs"
# The directories containing background audio files to mix with training data
background_paths:
- "./background_clips"
# The duplication rate for the background audio clips listed above (1 or higher). Can be useful as a way to oversample
# a particular type of background noise more relevant to a given deployment environment. Values apply in the same
# order as the background_paths list above. Only useful when multiple directories are provided above.
background_paths_duplication_rate:
- 1
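# For example (illustrative directory names), to oversample office noise 3x relative to
# the generic background clips:
# background_paths:
#   - "./background_clips"
#   - "./office_noise"
# background_paths_duplication_rate:
#   - 1
#   - 3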
# The location of pre-computed openwakeword features for false-positive validation data
# If you do not have deployment environment validation data, a good general-purpose dataset with
# a reasonable mix of ~11 hours of speech, noise, and music is available here: https://huggingface.co/datasets/davidscripka/openwakeword_features
false_positive_validation_data_path: "./validation_set_features.npy"
# The number of times to apply augmentations to the generated training data
# Values greater than 1 reuse each generated clip that many times; because the
# augmentations are random, each round still produces unique training clips from
# the same original synthetic generations. This can be a useful way to increase model
# robustness without having to generate extremely large numbers of synthetic examples.
augmentation_rounds: 1
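# For example, raising augmentation_rounds to 2 with n_samples: 10000 would yield
# roughly 10000 * 2 = 20000 unique augmented clips for training.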
# Paths to pre-computed openwakeword features for positive and negative data. Each file must be a saved
# .npy array (see the example notebook on manually training new models for details on how to create these).
# There is no limit on the number of files, but training speed will decrease as more
# data needs to be read from disk for each additional file.
# Also, there is a custom dataloader that uses memory-mapping when loading data, so the total size
# of the files is not limited by the amount of available system memory (though this may result
# in decreased training throughput depending on the speed of the underlying storage device). A fast
# NVMe SSD is recommended for optimal performance.
feature_data_files:
"ACAV100M_sample": "./openwakeword_features_ACAV100M_2000_hrs_16bit.npy"
# Define the number of examples from each data file per batch. Note that the key names here
# must correspond to those defined in the `feature_data_files` dictionary above (except for
# the `positive` and `adversarial_negative` keys, which are automatically defined). The sum
# of the values across all keys defines the total batch size for training. Initial testing indicates
# that batch sizes of 1024-4096 work well in practice.
batch_n_per_class:
"ACAV100M_sample": 1024
"adversarial_negative": 50
"positive": 50
# Define the type and size of the openwakeword model to train. Increasing the layer size
# may result in a more capable model, at the cost of decreased inference speed. The default
# value (32) seems to work well in practice for most wake words/phrases.
model_type: "dnn"
layer_size: 32
# Define training parameters. The values below are recommended defaults for most applications,
# but unique deployment environments will likely require testing to determine which values
# are the most appropriate.
# The maximum number of steps to train the model
steps: 50000
# The maximum negative weight and target false positives per hour, used to control the automatic training process
# The target false-positive rate may not be achieved, and adjusting the maximum negative weight may be necessary
max_negative_weight: 1500
target_false_positives_per_hour: 0.2