# Configuration file to be used with `train.py` to create custom wake word/phrase models
# The name of the model (will be used when creating directories and when saving the final .onnx and .tflite files)
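# A typical invocation looks like the following (the flag names are assumptions based on the
# upstream openWakeWord training script, not defined in this file; verify them against your
# copy of train.py):
#   python train.py --training_config custom_model.yml --generate_clips
#   python train.py --training_config custom_model.yml --augment_clips
#   python train.py --training_config custom_model.yml --train_model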
model_name: "my_model"
# The target word/phrase to be detected by the model. Adding multiple unique words/phrases will
# still train only a single binary detection model, but it will activate on any one of the provided words/phrases.
target_phrase:
- "hey jarvis"
# Specific phrases that you do *not* want the model to activate on, outside of those generated automatically via phoneme overlap
# This can be a good way to reduce false positives if you notice that, in practice, certain words or phrases are problematic
custom_negative_phrases: []
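# For example (hypothetical phrases, shown only to illustrate the list format):
# custom_negative_phrases:
#   - "hey jarvas"
#   - "hey travis"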
# The total number of positive samples to generate for training (minimum of 20,000 recommended, often 100,000+ is best)
n_samples: 10000
# The total number of positive samples to generate for validation and early stopping of model training
n_samples_val: 2000
# The batch size to use with Piper TTS when generating synthetic training data
tts_batch_size: 50
# The batch size to use when performing data augmentation on generated clips prior to training
# It's recommended that this not be too large to ensure that there is enough variety in the augmentation
augmentation_batch_size: 16
# The path to a fork of the piper-sample-generator repository for TTS (https://github.com/dscripka/piper-sample-generator)
piper_sample_generator_path: "./piper-sample-generator"
# The output directory for the generated synthetic clips, openwakeword features, and trained models
# Sub-directories will be automatically created for train and test clips for both positive and negative examples
output_dir: "./my_custom_model"
# The directories containing Room Impulse Response recordings
rir_paths:
- "./mit_rirs"
# The directories containing background audio files to mix with training data
background_paths:
- "./background_clips"
# The duplication rate for the background audio clips listed above (1 or higher). Can be useful as a way to oversample
# a particular type of background noise more relevant to a given deployment environment. Values apply in the same
# order as the background_paths list above. Only useful when multiple directories are provided above.
background_paths_duplication_rate:
- 1
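# For example (illustrative directory names), to oversample office noise 3x relative to
# the generic background clips:
# background_paths:
#   - "./background_clips"
#   - "./office_noise"
# background_paths_duplication_rate:
#   - 1
#   - 3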
# The location of pre-computed openwakeword features for false-positive validation data
# If you do not have deployment environment validation data, a good general-purpose dataset with
# a reasonable mix of ~11 hours of speech, noise, and music is available here: https://huggingface.co/datasets/davidscripka/openwakeword_features
false_positive_validation_data_path: "./validation_set_features.npy"
# The number of times to apply augmentations to the generated training data
# Values greater than 1 reuse each generated clip that many times; because the
# augmentations are random, each round still produces unique training clips from
# the same original synthetic generations. This can be a useful way to increase model
# robustness without having to generate extremely large numbers of synthetic examples.
augmentation_rounds: 1
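# For example, raising augmentation_rounds to 2 with n_samples: 10000 would yield
# roughly 10000 * 2 = 20000 unique augmented clips for training.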
# Paths to pre-computed openwakeword features for positive and negative data. Each file must be a saved
# .npy array (see the example notebook on manually training new models for details on how to create these).
# There is no limit on the number of files, but training speed will decrease as more
# data needs to be read from disk for each additional file.
# Also, there is a custom dataloader that uses memory-mapping when loading data, so the total size
# of the files is not limited by the amount of available system memory (though this may result
# in decreased training throughput depending on the speed of the underlying storage device). A fast
# NVMe SSD is recommended for optimal performance.
feature_data_files:
"ACAV100M_sample": "./openwakeword_features_ACAV100M_2000_hrs_16bit.npy"
# Define the number of examples from each data file per batch. Note that the key names here
# must correspond to those defined in the `feature_data_files` dictionary above (except for
# the `positive` and `adversarial_negative` keys, which are automatically defined). The sum
# of the values across all keys defines the total batch size for training. Initial testing indicates
# that batch sizes of 1024-4096 work well in practice.
batch_n_per_class:
"ACAV100M_sample": 1024
"adversarial_negative": 50
"positive": 50
# Define the type and size of the openwakeword model to train. Increasing the layer size
# may result in a more capable model, at the cost of decreased inference speed. The default
# value (32) seems to work well in practice for most wake words/phrases.
model_type: "dnn"
layer_size: 32
# Define training parameters. The values below are recommended defaults for most applications,
# but unique deployment environments will likely require testing to determine which values
# are the most appropriate.
# The maximum number of steps to train the model
steps: 50000
# The maximum negative weight and target false positives per hour, used to control the automatic training process
# The target false-positive rate may not be achieved, and adjusting the maximum negative weight may be necessary
max_negative_weight: 1500
target_false_positives_per_hour: 0.2