No VAD segmentation option (#182)

* Add support for disabled VAD mode and enhance CMake configuration
* Enhance VAD processing and transcription filter data structure with additional comments and logic adjustments
* Enhance VAD processing with improved logging, adjust single segment default, and update inference handling
* Refactor whisper parameter handling and enhance utility functions for better clarity and maintainability
* Add whisper parameters group properties and clean up related code
* Add whisper parameters handling and update related files for improved functionality
* Refactor whisper parameter type casting for improved clarity and consistency
* trigger build
* Fix logging message to use the correct variable for saved sentence
locaal-ai · Nov 25, 2024 · a32e327 · a32e327
1 parent 04a6f6a
commit a32e327
Show file tree

Hide file tree

Showing 12 changed files with 515 additions and 163 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,7 +59,7 @@ if(WIN32)
         "cpu"
         CACHE STRING "Acceleration to use")
   endif()
-  set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda")
+  set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda" "vulkan")
 endif()
 
 include(cmake/BuildWhispercpp.cmake)
@@ -101,6 +101,11 @@ include(cmake/BuildICU.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
+# Enable extra-verbose logging if LOCALVOCAL_EXTRA_VERBOSE is set in the environment at configure time
+if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE})
+  target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE)
+endif()
+
 target_sources(
   ${CMAKE_PROJECT_NAME}
   PRIVATE src/plugin-main.c
@@ -117,6 +122,7 @@ target_sources(
           src/whisper-utils/whisper-processing.cpp
           src/whisper-utils/whisper-utils.cpp
           src/whisper-utils/whisper-model-utils.cpp
+          src/whisper-utils/whisper-params.cpp
           src/whisper-utils/silero-vad-onnx.cpp
           src/whisper-utils/token-buffer-thread.cpp
           src/whisper-utils/vad-processing.cpp

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -13,30 +13,9 @@ external_model_file="External model file"
 whisper_parameters="Whisper Model Parameters"
 language="Input Language"
 whisper_sampling_method="Whisper Sampling Method"
-n_threads="Number of threads"
-n_max_text_ctx="Max text context"
-translate="Translate"
 translate_local="Local Translation"
 translate_cloud="Cloud Translation"
-no_context="No context"
-single_segment="Single segment"
-print_special="Print special"
-print_progress="Print progress"
-print_realtime="Print realtime"
-print_timestamps="Print timestamps"
-token_timestamps="Token timestamps"
-thold_pt="Token prob. threshold"
-thold_ptsum="Token sum prob. threshold"
-max_len="Max length in chars"
-split_on_word="Split on word"
-max_tokens="Max tokens"
 speed_up="Speed up"
-initial_prompt="Initial prompt"
-suppress_blank="Suppress blank"
-suppress_non_speech_tokens="Suppress non-speech tokens"
-temperature="Temperature"
-max_initial_ts="Max initial timestamps"
-length_penalty="Length penalty"
 save_srt="Save in SRT format"
 truncate_output_file="Truncate file on new sentence"
 only_while_recording="Write output only while recording"
@@ -92,11 +71,51 @@ partial_latency="Latency (ms)"
 vad_mode="VAD Mode"
 Active_VAD="Active VAD"
 Hybrid_VAD="Hybrid VAD"
+No_VAD="No VAD"
 translate_only_full_sentences="Translate only full sentences"
 duration_filter_threshold="Duration filter"
 segment_duration="Segment duration"
 n_context_sentences="# Context sentences"
 max_sub_duration="Max. sub duration (ms)"
+# Whisper model parameters
+strategy="Strategy"
+n_threads="Number of threads"
+n_max_text_ctx="Max text context"
+offset_ms="Offset (ms)"
+duration_ms="Duration (ms)"
+whisper_translate="Translate"
+no_context="No context"
+no_timestamps="No timestamps"
+single_segment="Single segment"
+print_special="Print special"
+print_progress="Print progress"
+print_realtime="Print realtime"
+print_timestamps="Print timestamps"
+token_timestamps="Token timestamps"
+thold_pt="Token prob. threshold"
+thold_ptsum="Token sum prob. threshold"
+max_len="Max length in chars"
+split_on_word="Split on word"
+max_tokens="Max tokens"
+debug_mode="Debug mode"
+audio_ctx="Audio context"
+tdrz_enable="Enable TDRZ"
+suppress_regex="Suppress regex"
+initial_prompt="Initial prompt"
+language="Input Language"
+detect_language="Detect language"
+suppress_blank="Suppress blank"
+suppress_non_speech_tokens="Suppress non-speech tokens"
+temperature="Temperature"
+max_initial_ts="Max initial timestamps"
+length_penalty="Length penalty"
+temperature_inc="Temperature increment"
+entropy_thold="Entropy threshold"
+logprob_thold="Logprob threshold"
+no_speech_thold="No speech threshold"
+greedy.best_of="Greedy best of"
+beam_search.beam_size="Beam size"
+beam_search.patience="Patience"
 Google-Cloud-Translation="Google Cloud Translation"
 Microsoft-Translator="Microsoft Azure Translator"
 Amazon-Translate="AWS Translate"

diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
@@ -143,7 +143,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
 		openmode |= std::ios::app;
 	}
 	if (!gf->save_srt) {
-		// Write raw sentence to file
+		obs_log(gf->log_level, "Saving sentence '%s' to file %s", sentence.c_str(),
+			gf->output_file_path.c_str());
+		// Write raw sentence to text file (non-srt format)
 		try {
 			std::ofstream output_file(file_path, openmode);
 			output_file << sentence << std::endl;

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -87,6 +87,7 @@ struct transcription_filter_data {
 	bool partial_transcription = false;
 	int partial_latency = 1000;
 	float duration_filter_threshold = 2.25f;
+	// Duration of the target segment buffer in ms
 	int segment_duration = 7000;
 
 	// Cloud translation options

diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
@@ -8,13 +8,15 @@
 #include "transcription-filter-utils.h"
 #include "whisper-utils/whisper-language.h"
 #include "whisper-utils/vad-processing.h"
+#include "whisper-utils/whisper-params.h"
 #include "model-utils/model-downloader-types.h"
 #include "translation/language_codes.h"
 #include "ui/filter-replace-dialog.h"
 #include "ui/filter-replace-utils.h"
 
 #include <string>
 #include <vector>
+#include "whisper-utils/whisper-utils.h"
 
 bool translation_options_callback(obs_properties_t *props, obs_property_t *property,
 				  obs_data_t *settings)
@@ -479,6 +481,7 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_
 	obs_property_t *vad_mode_list =
 		obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"),
 					OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(vad_mode_list, MT_("No_VAD"), VAD_MODE_DISABLED);
 	obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE);
 	obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID);
 	// add vad threshold slider
@@ -528,82 +531,6 @@ void add_logging_group_properties(obs_properties_t *ppts)
 	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
 }
 
-void add_whisper_params_group_properties(obs_properties_t *ppts)
-{
-	obs_properties_t *whisper_params_group = obs_properties_create();
-	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
-				 OBS_GROUP_NORMAL, whisper_params_group);
-
-	obs_property_t *whisper_sampling_method_list = obs_properties_add_list(
-		whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"),
-		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
-	obs_property_list_add_int(whisper_sampling_method_list, "Beam search",
-				  WHISPER_SAMPLING_BEAM_SEARCH);
-	obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY);
-
-	// add int slider for context sentences
-	obs_properties_add_int_slider(whisper_params_group, "n_context_sentences",
-				      MT_("n_context_sentences"), 0, 5, 1);
-
-	// int n_threads;
-	obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1);
-	// int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
-	obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", MT_("n_max_text_ctx"),
-				      0, 16384, 100);
-	// int offset_ms;          // start offset in ms
-	// int duration_ms;        // audio duration to process in ms
-	// bool translate;
-	obs_properties_add_bool(whisper_params_group, "whisper_translate",
-				MT_("whisper_translate"));
-	// bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
-	obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context"));
-	// bool single_segment;    // force single segment output (useful for streaming)
-	obs_properties_add_bool(whisper_params_group, "single_segment", MT_("single_segment"));
-	// bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-	obs_properties_add_bool(whisper_params_group, "print_special", MT_("print_special"));
-	// bool print_progress;    // print progress information
-	obs_properties_add_bool(whisper_params_group, "print_progress", MT_("print_progress"));
-	// bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-	obs_properties_add_bool(whisper_params_group, "print_realtime", MT_("print_realtime"));
-	// bool print_timestamps;  // print timestamps for each text segment when printing realtime
-	obs_properties_add_bool(whisper_params_group, "print_timestamps", MT_("print_timestamps"));
-	// bool  token_timestamps; // enable token-level timestamps
-	obs_properties_add_bool(whisper_params_group, "token_timestamps", MT_("token_timestamps"));
-	// enable DTW timestamps
-	obs_properties_add_bool(whisper_params_group, "dtw_token_timestamps",
-				MT_("dtw_token_timestamps"));
-	// float thold_pt;         // timestamp token probability threshold (~0.01)
-	obs_properties_add_float_slider(whisper_params_group, "thold_pt", MT_("thold_pt"), 0.0f,
-					1.0f, 0.05f);
-	// float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
-	obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", MT_("thold_ptsum"),
-					0.0f, 1.0f, 0.05f);
-	// int   max_len;          // max segment length in characters
-	obs_properties_add_int_slider(whisper_params_group, "max_len", MT_("max_len"), 0, 100, 1);
-	// bool  split_on_word;    // split on word rather than on token (when used with max_len)
-	obs_properties_add_bool(whisper_params_group, "split_on_word", MT_("split_on_word"));
-	// int   max_tokens;       // max tokens per segment (0 = no limit)
-	obs_properties_add_int_slider(whisper_params_group, "max_tokens", MT_("max_tokens"), 0, 100,
-				      1);
-	// const char * initial_prompt;
-	obs_properties_add_text(whisper_params_group, "initial_prompt", MT_("initial_prompt"),
-				OBS_TEXT_DEFAULT);
-	// bool suppress_blank
-	obs_properties_add_bool(whisper_params_group, "suppress_blank", MT_("suppress_blank"));
-	// bool suppress_non_speech_tokens
-	obs_properties_add_bool(whisper_params_group, "suppress_non_speech_tokens",
-				MT_("suppress_non_speech_tokens"));
-	// float temperature
-	obs_properties_add_float_slider(whisper_params_group, "temperature", MT_("temperature"),
-					0.0f, 1.0f, 0.05f);
-	// float max_initial_ts
-	obs_properties_add_float_slider(whisper_params_group, "max_initial_ts",
-					MT_("max_initial_ts"), 0.0f, 1.0f, 0.05f);
-	// float length_penalty
-	obs_properties_add_float_slider(whisper_params_group, "length_penalty",
-					MT_("length_penalty"), -1.0f, 1.0f, 0.1f);
-}
-
 void add_general_group_properties(obs_properties_t *ppts)
 {
 	// add "General" group
@@ -742,28 +669,5 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_string(s, "translate_cloud_region", "eastus");
 
 	// Whisper parameters
-	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
-	obs_data_set_default_int(s, "n_context_sentences", 0);
-	obs_data_set_default_string(s, "initial_prompt", "");
-	obs_data_set_default_int(s, "n_threads", 4);
-	obs_data_set_default_int(s, "n_max_text_ctx", 16384);
-	obs_data_set_default_bool(s, "whisper_translate", false);
-	obs_data_set_default_bool(s, "no_context", true);
-	obs_data_set_default_bool(s, "single_segment", true);
-	obs_data_set_default_bool(s, "print_special", false);
-	obs_data_set_default_bool(s, "print_progress", false);
-	obs_data_set_default_bool(s, "print_realtime", false);
-	obs_data_set_default_bool(s, "print_timestamps", false);
-	obs_data_set_default_bool(s, "token_timestamps", false);
-	obs_data_set_default_bool(s, "dtw_token_timestamps", false);
-	obs_data_set_default_double(s, "thold_pt", 0.01);
-	obs_data_set_default_double(s, "thold_ptsum", 0.01);
-	obs_data_set_default_int(s, "max_len", 0);
-	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 50);
-	obs_data_set_default_bool(s, "suppress_blank", false);
-	obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
-	obs_data_set_default_double(s, "temperature", 0.1);
-	obs_data_set_default_double(s, "max_initial_ts", 1.0);
-	obs_data_set_default_double(s, "length_penalty", -1.0);
+	apply_whisper_params_defaults_on_settings(s);
 }
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
@@ -25,6 +25,7 @@
 #include "whisper-utils/whisper-language.h"
 #include "whisper-utils/whisper-model-utils.h"
 #include "whisper-utils/whisper-utils.h"
+#include "whisper-utils/whisper-params.h"
 #include "translation/language_codes.h"
 #include "translation/translation-utils.h"
 #include "translation/translation.h"
@@ -364,51 +365,27 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		gf->sentence_psum_accept_thresh =
 			(float)obs_data_get_double(s, "sentence_psum_accept_thresh");
 
-		gf->whisper_params = whisper_full_default_params(
-			(whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method"));
-		gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec");
+		apply_whisper_params_from_settings(gf->whisper_params, s);
+
 		if (!new_translate || gf->translation_model_index != "whisper-based-translation") {
 			const char *whisper_language_select =
 				obs_data_get_string(s, "whisper_language_select");
-			gf->whisper_params.language = (whisper_language_select != nullptr &&
-						       strlen(whisper_language_select) > 0)
-							      ? whisper_language_select
-							      : "auto";
+			const bool language_selected = whisper_language_select != nullptr &&
+						       strlen(whisper_language_select) > 0;
+			gf->whisper_params.language = (language_selected) ? whisper_language_select
+									  : "auto";
+			gf->whisper_params.detect_language = !language_selected;
 		} else {
 			// take the language from gf->target_lang
 			if (language_codes_to_whisper.count(gf->target_lang) > 0) {
 				gf->whisper_params.language =
 					language_codes_to_whisper[gf->target_lang].c_str();
+				gf->whisper_params.detect_language = false;
 			} else {
 				gf->whisper_params.language = "auto";
+				gf->whisper_params.detect_language = true;
 			}
 		}
-		gf->whisper_params.initial_prompt =
-			obs_data_get_string(s, "initial_prompt") != nullptr
-				? obs_data_get_string(s, "initial_prompt")
-				: "";
-		gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
-		gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
-		gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate");
-		gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
-		gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment");
-		gf->whisper_params.print_special = obs_data_get_bool(s, "print_special");
-		gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress");
-		gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime");
-		gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps");
-		gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
-		gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
-		gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
-		gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
-		gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word");
-		gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
-		gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
-		gf->whisper_params.suppress_non_speech_tokens =
-			obs_data_get_bool(s, "suppress_non_speech_tokens");
-		gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
-		gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
-		gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
-		gf->whisper_params.no_timestamps = true;
 
 		if (gf->vad) {
 			const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");