# noinspection PyPackageRequirements
import yaml
import os
from pathlib import Path
from click import core
import threading

import Utilities

DEFAULT_SETTINGS_PATH = Path(Path.cwd() / "Profiles" / 'settings.yaml')
DEBOUNCE_TIME = 1.5  # 1.5 seconds, adjust as necessary

NON_PERSISTENT_SETTINGS = [
    "stt_enabled",
    "whisper_languages", "lang_swap", "verbose",
    "transl_result_textarea_savetts_voice", "transl_result_textarea_sendtts_download",
    "plugin_timer_stopped", "plugin_current_timer", "websocket_final_messages",
    "device_default_in_index", "device_default_out_index", "ui_download",
    "audio_processor_caller",
]


class SettingsManager:
    def __init__(self, immutable=False):
        self.settings_path = None
        self.immutable = immutable
        self.translate_settings = {
            "process_id": 0,  # the process id of the running instance

            # text translate settings
            "txt_translate": False,  # if enabled, pipes whisper A.I. results through text translator
            "txt_translator_device": "cpu",  # auto, cuda, cpu
            "src_lang": "auto",  # source language for text translator (Whisper A.I. in translation mode always translates to "en")
            "trg_lang": "fra_Latn",  # target language for text translator
            "txt_romaji": False,  # if enabled, text translator will convert text to romaji.
            "txt_translator": "NLLB200_CT2",  # can be "NLLB200", "NLLB200_CT2" or "M2M100"
            "txt_translator_size": "small",  # for M2M100 model size: can be "small" or "large". for NLLB200 model size: can be "small", "medium" or "large".
            "txt_translator_precision": "float32",  # for CTranslate2-based translators: can be "default", "auto", "int8", "int8_float16", "int16", "float16", "float32".
            "txt_translate_realtime": False,  # use text translator in realtime mode
            "txt_second_translation_enabled": False,  # translate to more languages
            "txt_second_translation_languages": "eng_Latn",  # comma-separated list of languages for further translations
            "txt_second_translation_wrap": " | ",  # wrap other translations in result with this string

            # OCR settings
            "ocr_lang": "en",  # language for OCR image to text recognition.
            "ocr_window_name": "VRChat",  # window name for OCR image to text recognition.
            "ocr_txt_src_lang": "auto",  # (internal setting for OCR text translation)
            "ocr_txt_trg_lang": "eng_Latn",  # (internal setting for OCR text translation)

            # audio settings
            "audio_api": "MME",  # The name of the audio API. (MME, DirectSound, WASAPI)
            "audio_input_device": "",  # audio input device name - used by the Whispering Tiger UI to select the audio input device by name
            "audio_output_device": "",  # audio output device name - used by the Whispering Tiger UI to select the audio output device by name
            "device_index": None,  # input device index for STT
            "device_out_index": None,  # output device index for TTS

            # whisper settings
            "stt_enabled": True,  # enable STT (if disabled, stops sending audio to whisper)
            "ai_device": None,  # can be None (auto), "cuda" or "cpu".
            "whisper_task": "transcribe",  # Whisper A.I. can do "transcribe" or "translate"
            "current_language": None,  # can be None (auto) or any Whisper supported language.
            "target_language": "eng",  # can be any M4T supported language.
            "model": "small",  # Whisper model size. Can be "tiny", "base", "small", "medium" or "large"
            "condition_on_previous_text": False,  # if enabled, Whisper will condition on previous text. (more prone to loops or getting stuck)
            "prompt_reset_on_temperature": 0.5,  # after which temperature fallback step the prompt with the previous text should be reset (default value is 0.5)
            "energy": 300,  # energy of audio volume to start whisper processing. Can be 0-1000
            "phrase_time_limit": 0,  # time limit for Whisper to generate a phrase. (0 = no limit)
            "pause": 1.0,  # pause between phrases.
            "initial_prompt": "",  # initial prompt for Whisper. For example "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking." will give more filler words.
            "logprob_threshold": "-1.0",
            "no_speech_threshold": "0.6",
            "length_penalty": 1.0,
            "beam_search_patience": 1.0,
            "repetition_penalty": 1.0,  # penalize the score of previously generated tokens (set > 1 to penalize)
            "no_repeat_ngram_size": 0,  # prevent repetitions of ngrams with this size
            "whisper_precision": "float32",  # for original Whisper can be "float16" or "float32", for faster-whisper "default", "auto", "int8", "int8_float16", "int16", "float16", "float32".
            "stt_type": "faster_whisper",  # can be "faster_whisper", "original_whisper", "transformer_whisper", "speech_t5", "seamless_m4t" etc.
            "temperature_fallback": True,  # set to False to disable temperature fallback; this avoids some slowdowns but decreases quality.
            "beam_size": 5,  # beam size for beam search. (higher = more accurate, but slower)
            "whisper_cpu_threads": 0,  # number of threads to use when running on CPU (4 by default)
            "whisper_num_workers": 1,  # number of workers to use when transcribe() is called from multiple Python threads
            "vad_enabled": True,  # enable Voice Activity Detection (VAD)
            "vad_on_full_clip": False,  # make an additional VAD check on the full clip (not only on each frame).
            "vad_confidence_threshold": 0.4,  # Voice Activity Detection (VAD) confidence threshold. Can be 0-1
            "vad_frames_per_buffer": 512,  # Voice Activity Detection (VAD) sample size (how many audio samples should be tested). Values other than 512, 256 are not supported. (default: 512)
            "vad_thread_num": 1,  # number of threads to use for VAD.
            "push_to_talk_key": "",  # push-to-talk key. (empty or None to disable)
            "word_timestamps": False,  # if enabled, Whisper will add timestamps to the transcribed text.
            "faster_without_timestamps": False,  # if enabled, faster-whisper will only sample text tokens. (only when using stt_type=faster_whisper)
            "whisper_apply_voice_markers": False,  # if enabled, Whisper will apply voice markers.
            "max_sentence_repetition": -1,  # set max sentence repetition in result (-1 = disabled)
            "transcription_auto_save_file": "",  # set to a filepath to save transcriptions. (empty or None to disable)
            "transcription_auto_save_continous_text": False,  # set to save a continuous text line instead of CSV
            "transcription_save_audio_dir": "",  # set to a directory path to save transcription wav files. (empty or None to disable)
            "silence_cutting_enabled": True,
            "silence_offset": -40.0,
            "max_silence_length": 30.0,
            "keep_silence_length": 0.20,
            "normalize_enabled": True,
            "normalize_lower_threshold": -24.0,
            "normalize_upper_threshold": -16.0,
            "normalize_gain_factor": 2.0,
            "denoise_audio": "",  # if set, audio will be de-noised before processing. (can be empty, "deepfilter" or "noise_reduce")
            "denoise_audio_before_trigger": False,  # if enabled, noise cancellation will be applied to the audio chunks before recording trigger conditions are detected.
            "denoise_audio_post_filter": False,  # enable post filter for some minor, extra noise reduction.
            "thread_per_transcription": True,  # use a separate thread for each transcription.
            "speaker_diarization": False,  # enable speaker diarization.
            "speaker_change_split": True,  # split audio at speaker changes. (when speaker diarization is enabled)
            "min_speaker_length": 0.5,  # minimum length a speaker should talk. (to reduce speaker change fluctuations)
            "min_speakers": 1,  # minimum amount of detected speakers
            "max_speakers": 3,  # maximum amount of detected speakers
            "realtime": False,  # if enabled, Whisper will process audio in realtime.
            "realtime_whisper_model": "",  # model used for realtime transcription. (empty to use the same model as the model setting)
            "realtime_whisper_precision": "float16",  # precision used for the realtime transcription model. (only used when realtime_whisper_model is set)
            "realtime_whisper_beam_size": 1,  # beam size used for the realtime transcription model.
            "realtime_temperature_fallback": False,  # set to False to disable temperature fallback for realtime transcription. (see temperature_fallback setting)
            "realtime_frame_multiply": 15,  # only sends the audio clip to Whisper every X frames (and only if it is at least this length, to prevent partial frames). (higher = fewer whisper updates and less processing time)
            "realtime_frequency_time": 1.0,  # only sends the audio clip to Whisper every X seconds. (higher = fewer whisper updates and less processing time)

            # OSC settings
            "osc_ip": "127.0.0.1",  # OSC IP address. set to "0" to disable.
            "osc_port": 9000,
            "osc_address": "/chatbox/input",
            "osc_min_time_between_messages": 1.5,  # defines the minimum time between OSC messages in seconds.
            "osc_typing_indicator": True,  # display typing indicator while processing audio
            "osc_convert_ascii": False,
            "osc_chat_prefix": "",  # prefix for OSC messages.
            "osc_chat_limit": 144,  # defines the maximum length of a chat message.
            "osc_time_limit": 15.0,  # defines the time between OSC messages in seconds.
            "osc_scroll_time_limit": 1.5,  # defines the scroll time limit for scrolling OSC messages. (only used when osc_send_type is set to "scroll")
            "osc_initial_time_limit": 15.0,  # defines the initial time after the first message is sent.
            "osc_scroll_size": 3,  # defines the scroll size for scrolling OSC messages. (only used when osc_send_type is set to "scroll")
            "osc_max_scroll_size": 30,  # defines the maximum scroll size for scrolling OSC messages. ~30 to scroll on only a single line (only used when osc_send_type is set to "scroll")
            "osc_send_type": "chunks",  # defines how OSC messages are sent. Can be "scroll", "full_or_scroll", "chunks" or "full". "scroll" sends the text scrolling until all of it is sent, "full_or_scroll" only scrolls if the text is too long, "chunks" sends the text in chunks and "full" sends the whole text at once.
            "osc_auto_processing_enabled": True,  # toggle auto sending of OSC messages on WhisperAI results. (not saved)
            "osc_type_transfer": "translation_result",  # defines which type of data to send. Can be "source", "translation_result" or "both".
            "osc_type_transfer_split": " 🌐 ",  # defines how source and translation results are split. (only used when osc_type_transfer is set to "both")
            "osc_delay_until_audio_playback": False,  # if enabled, OSC messages will be delayed until audio playback starts. (if no TTS is used, this will prevent messages from being sent.)
            "osc_delay_until_audio_playback_tag": "tts",  # defines the tag used for detecting audio playback. (only used when osc_delay_until_audio_playback is enabled. Set empty to detect any audio playback)
            "osc_delay_timeout": 10,  # defines the timeout for delayed OSC messages. (only used when osc_delay_until_audio_playback is enabled)
            "osc_server_ip": "127.0.0.1",  # OSC server IP address. set to "0" to disable.
            "osc_server_port": 9001,
            "osc_sync_mute": False,  # defines if STT is only active if the VRChat mic is not muted
            "osc_sync_afk": False,  # defines if STT is only active if VRChat is not AFK

            # websocket settings
            "websocket_ip": "127.0.0.1",
            "websocket_port": 5000,
            "websocket_final_messages": True,  # if enabled, websocket will send final messages. (internal use)

            # TTS settings
            # "tts_enabled": True,  # enable TTS
            "tts_type": "silero",  # TTS engine type. Can be "silero", "f5_e2" or "".
            "tts_ai_device": "cpu",  # can be "auto", "cuda" or "cpu".
            "tts_answer": False,  # send whisper results to TTS engine
            "tts_model": ["en", "v3_en"],  # TTS language and model to use
            "tts_voice": "en_0",  # TTS voice (one of the silero tts voices, or "last" to use the last used voice)
            "tts_prosody_rate": "",  # TTS voice speed. Can be "x-slow", "slow", "medium", "fast", "x-fast" or "" for default.
            "tts_prosody_pitch": "",  # TTS voice pitch. Can be "x-low", "low", "medium", "high", "x-high" or "" for default.
            "tts_use_secondary_playback": False,  # play TTS audio to a secondary audio device at the same time.
            "tts_secondary_playback_device": -1,  # play TTS audio to this specified audio device at the same time. (set to -1 to use the default audio device)
            "tts_allow_overlapping_audio": False,  # allow overlapping audio (if disabled, TTS will stop previous audio before playing new audio)
            "tts_volume": 1.0,  # change the volume of played audio. lower than 1 reduces volume, higher increases volume.

            # Plugins
            "plugins": {},  # active plugins
            "plugin_settings": {},  # plugin settings
            "plugin_timer_timeout": 15.0,  # timer timeout for plugins
            "plugin_timer": 2.0,  # timer for plugins
            "plugin_timer_stopped": False,
            "plugin_current_timer": 0.0,
        }
        self._save_timer = None

    def set_option(self, setting, value):
        if setting in self.translate_settings:
            if self.translate_settings[setting] != value:
                self.translate_settings[setting] = value
                # Save settings
                if setting not in NON_PERSISTENT_SETTINGS and (self.settings_path is not None and not self.immutable):
                    self.debounced_save_yaml(self.settings_path)
        else:
            self.translate_settings[setting] = value
            # Save settings
            if setting not in NON_PERSISTENT_SETTINGS and (self.settings_path is not None and not self.immutable):
                self.debounced_save_yaml(self.settings_path)
        return value

    def get_option(self, setting):
        return self.translate_settings.get(setting, None)

    def get_all_settings(self):
        return self.translate_settings

    def load_yaml(self, path):
        self.settings_path = path
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                loaded_data = yaml.safe_load(f)
                sanitized_data = Utilities.handle_bytes(loaded_data)
                self.translate_settings.update(sanitized_data)

    def debounced_save_yaml(self, path):
        # Cancel the existing timer if it exists
        if self._save_timer is not None:
            self._save_timer.cancel()
        # Start a new timer
        self._save_timer = threading.Timer(DEBOUNCE_TIME, self.save_yaml, [path])
        self._save_timer.start()

    def save_yaml(self, path):
        # If this function was called directly, cancel the timer
        if self._save_timer is not None:
            self._save_timer.cancel()
            self._save_timer = None
        # Do not save if the path is None or settings are set to immutable
        if self.settings_path is None or self.immutable:
            print("Not saved. - No path set or immutable")
            return
        to_save_settings = self.translate_settings.copy()
        # Remove settings that are in NON_PERSISTENT_SETTINGS
        for setting in NON_PERSISTENT_SETTINGS:
            if setting in to_save_settings:
                del to_save_settings[setting]
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(to_save_settings, f)
    @staticmethod
    def is_argument_setting(ctx, argument_name):
        return ctx.get_parameter_source(argument_name) == core.ParameterSource.COMMANDLINE

    def get_argument_setting_fallback(self, ctx, argument_name, fallback_setting_name):
        if self.is_argument_setting(ctx, argument_name):
            return ctx.params[argument_name]
        else:
            return self.get_option(fallback_setting_name)

    def get_available_models(self):
        available_models_list = []
        if '_whisper' in self.get_option("stt_type"):
            from whisper import available_models
            available_models_list = available_models()
        # add custom models to list
        if self.get_option("stt_type") == "faster_whisper":
            available_models_list.insert(0, "large-v3-turbo")
            available_models_list.insert(0, "medium-distilled.en")
            available_models_list.insert(0, "large-distilled-v2.en")
            available_models_list.insert(0, "large-distilled-v3.en")
            available_models_list.insert(0, "crisper")  # https://huggingface.co/nyrahealth/faster_CrisperWhisper/
            available_models_list.insert(0, "small.eu")
            available_models_list.insert(0, "medium.eu")
            available_models_list.insert(0, "small.de")
            available_models_list.insert(0, "medium.de")
            available_models_list.insert(0, "large-v2.de2")
            available_models_list.insert(0, "large-distilled-v3.de")
            available_models_list.insert(0, "small.de-swiss")
            available_models_list.insert(0, "medium.mix-jpv2")
            available_models_list.insert(0, "large-v2.mix-jp")
            available_models_list.insert(0, "small.jp")
            available_models_list.insert(0, "medium.jp")
            available_models_list.insert(0, "large-v2.jp")
            available_models_list.insert(0, "medium.ko")
            available_models_list.insert(0, "large-v2.ko")
            available_models_list.insert(0, "small.zh")
            available_models_list.insert(0, "medium.zh")
            available_models_list.insert(0, "large-v2.zh")
        if '_whisper' in self.get_option("stt_type"):
            available_models_list.insert(0, "custom")
        return available_models_list
    def get_available_setting_values(self):
        possible_settings = {
            "ai_device": ["None", "cuda", "cpu", "direct-ml:0", "direct-ml:1"],
            "model": self.get_available_models(),
            "whisper_task": ["transcribe", "translate"],
            "stt_type": ["faster_whisper", "original_whisper", "transformer_whisper", "medusa_whisper", "seamless_m4t", "mms", "speech_t5", "wav2vec_bert", "nemo_canary", ""],
            "tts_type": ["silero", "f5_e2", ""],
            "tts_ai_device": ["cuda", "cpu"],
            "txt_translator_device": ["cuda", "cpu"],
            "txt_translator": ["", "NLLB200_CT2", "NLLB200", "M2M100", "Seamless_M4T"],
            "txt_translator_size": ["small", "medium", "large"],
            "txt_translator_precision": ["float32", "float16", "int16", "int8_float16", "int8", "bfloat16", "int8_bfloat16", "4bit", "8bit"],
            "tts_prosody_rate": ["", "x-slow", "slow", "medium", "fast", "x-fast"],
            "tts_prosody_pitch": ["", "x-low", "low", "medium", "high", "x-high"],
            "whisper_precision": ["float32", "float16", "int16", "int8_float16", "int8", "bfloat16", "int8_bfloat16", "4bit", "8bit"],
            "realtime_whisper_model": [""] + self.get_available_models(),
            "realtime_whisper_precision": ["float32", "float16", "int16", "int8_float16", "int8", "bfloat16", "int8_bfloat16", "4bit", "8bit"],
            "osc_type_transfer": ["source", "translation_result", "both", "both_inverted"],
            "osc_send_type": ["full", "full_or_scroll", "scroll", "chunks"],
            "denoise_audio": ["", "noise_reduce", "deepfilter"],
        }
        return possible_settings

    # Legacy Functions
    def LoadYaml(self, path):
        self.load_yaml(path)

    def SetOption(self, setting, value):
        return self.set_option(setting, value)

    def GetOption(self, setting):
        return self.get_option(setting)


# Legacy code ---
SETTINGS = SettingsManager()


def SetOption(setting, value):
    return SETTINGS.set_option(setting, value)


def GetOption(setting):
    return SETTINGS.get_option(setting)
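

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how this settings manager is typically driven: load a
# profile from disk, read and change options, then force a write. It only uses
# names defined above (DEFAULT_SETTINGS_PATH, SETTINGS, GetOption, SetOption,
# save_yaml); how the surrounding application actually wires this up may differ.
if __name__ == "__main__":
    # Make sure the default profile directory exists before loading/saving.
    DEFAULT_SETTINGS_PATH.parent.mkdir(parents=True, exist_ok=True)
    SETTINGS.load_yaml(str(DEFAULT_SETTINGS_PATH))

    # Reading an option returns its current value, or None for unknown settings.
    print("STT model:", GetOption("model"))

    # Changing a persistent option schedules a debounced YAML save;
    # entries listed in NON_PERSISTENT_SETTINGS are kept in memory only.
    SetOption("osc_ip", "127.0.0.1")
    SetOption("stt_enabled", False)  # non-persistent, never written to disk

    # Force an immediate write instead of waiting for the debounce timer.
    SETTINGS.save_yaml(SETTINGS.settings_path)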