From a32e3274777ac0bf00c4679e0d351657569e45ba Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Sun, 24 Nov 2024 20:28:46 -0800
Subject: [PATCH] No VAD segmentation option (#182)

* Add support for disabled VAD mode and enhance CMake configuration

* Enhance VAD processing and transcription filter data structure with additional comments and logic adjustments

* Enhance VAD processing with improved logging, adjust single segment default, and update inference handling

* Refactor whisper parameter handling and enhance utility functions for better clarity and maintainability

* Add whisper parameters group properties and clean up related code

* Add whisper parameters handling and update related files for improved functionality

* Refactor whisper parameter type casting for improved clarity and consistency

* trigger build

* Fix logging message to use the correct variable for saved sentence
---
 CMakeLists.txt                           |   8 +-
 data/locale/en-US.ini                    |  61 ++++---
 src/transcription-filter-callbacks.cpp   |   4 +-
 src/transcription-filter-data.h          |   1 +
 src/transcription-filter-properties.cpp  | 104 +-----------
 src/transcription-filter.cpp             |  43 ++---
 src/whisper-utils/vad-processing.cpp     |  88 +++++++++++
 src/whisper-utils/vad-processing.h       |  40 +++++
 src/whisper-utils/whisper-params.cpp     | 192 +++++++++++++++++++++++
 src/whisper-utils/whisper-params.h       |  52 ++++++
 src/whisper-utils/whisper-processing.cpp |  19 ++-
 src/whisper-utils/whisper-utils.h        |  66 +++++++-
 12 files changed, 515 insertions(+), 163 deletions(-)
 create mode 100644 src/whisper-utils/whisper-params.cpp
 create mode 100644 src/whisper-utils/whisper-params.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c18ca66..4ccb91d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,7 +59,7 @@ if(WIN32)
         "cpu"
         CACHE STRING "Acceleration to use")
   endif()
-  set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda")
+  set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda" "vulkan")
 endif()
 
 include(cmake/BuildWhispercpp.cmake)
@@ -101,6 +101,11 @@ include(cmake/BuildICU.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
+# check env var for extra verbose logging
+if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE})
+  target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE)
+endif()
+
 target_sources(
   ${CMAKE_PROJECT_NAME}
   PRIVATE src/plugin-main.c
@@ -117,6 +122,7 @@ target_sources(
           src/whisper-utils/whisper-processing.cpp
           src/whisper-utils/whisper-utils.cpp
           src/whisper-utils/whisper-model-utils.cpp
+          src/whisper-utils/whisper-params.cpp
           src/whisper-utils/silero-vad-onnx.cpp
           src/whisper-utils/token-buffer-thread.cpp
           src/whisper-utils/vad-processing.cpp
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 3f94d4e..0d326fb 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -13,30 +13,9 @@ external_model_file="External model file"
 whisper_parameters="Whisper Model Parameters"
 language="Input Language"
 whisper_sampling_method="Whisper Sampling Method"
-n_threads="Number of threads"
-n_max_text_ctx="Max text context"
-translate="Translate"
 translate_local="Local Translation"
 translate_cloud="Cloud Translation"
-no_context="No context"
-single_segment="Single segment"
-print_special="Print special"
-print_progress="Print progress"
-print_realtime="Print realtime"
-print_timestamps="Print timestamps"
-token_timestamps="Token timestamps"
-thold_pt="Token prob. threshold"
-thold_ptsum="Token sum prob. threshold"
-max_len="Max length in chars"
-split_on_word="Split on word"
-max_tokens="Max tokens"
 speed_up="Speed up"
-initial_prompt="Initial prompt"
-suppress_blank="Suppress blank"
-suppress_non_speech_tokens="Suppress non-speech tokens"
-temperature="Temperature"
-max_initial_ts="Max initial timestamps"
-length_penalty="Length penalty"
 save_srt="Save in SRT format"
 truncate_output_file="Truncate file on new sentence"
 only_while_recording="Write output only while recording"
@@ -92,11 +71,51 @@ partial_latency="Latency (ms)"
 vad_mode="VAD Mode"
 Active_VAD="Active VAD"
 Hybrid_VAD="Hybrid VAD"
+No_VAD="No VAD"
 translate_only_full_sentences="Translate only full sentences"
 duration_filter_threshold="Duration filter"
 segment_duration="Segment duration"
 n_context_sentences="# Context sentences"
 max_sub_duration="Max. sub duration (ms)"
+# Whisper model parameters
+strategy="Strategy"
+n_threads="Number of threads"
+n_max_text_ctx="Max text context"
+offset_ms="Offset (ms)"
+duration_ms="Duration (ms)"
+whisper_translate="Translate"
+no_context="No context"
+no_timestamps="No timestamps"
+single_segment="Single segment"
+print_special="Print special"
+print_progress="Print progress"
+print_realtime="Print realtime"
+print_timestamps="Print timestamps"
+token_timestamps="Token timestamps"
+thold_pt="Token prob. threshold"
+thold_ptsum="Token sum prob. threshold"
+max_len="Max length in chars"
+split_on_word="Split on word"
+max_tokens="Max tokens"
+debug_mode="Debug mode"
+audio_ctx="Audio context"
+tdrz_enable="Enable TDRZ"
+suppress_regex="Suppress regex"
+initial_prompt="Initial prompt"
+language="Input Language"
+detect_language="Detect language"
+suppress_blank="Suppress blank"
+suppress_non_speech_tokens="Suppress non-speech tokens"
+temperature="Temperature"
+max_initial_ts="Max initial timestamps"
+length_penalty="Length penalty"
+temperature_inc="Temperature increment"
+entropy_thold="Entropy threshold"
+logprob_thold="Logprob threshold"
+no_speech_thold="No speech threshold"
+greedy.best_of="Greedy best of"
+beam_search.beam_size="Beam size"
+beam_search.patience="Patience"
 Google-Cloud-Translation="Google Cloud Translation"
 Microsoft-Translator="Microsoft Azure Translator"
 Amazon-Translate="AWS Translate"
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 5938863..98be9c7 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -143,7 +143,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
 		openmode |= std::ios::app;
 	}
 	if (!gf->save_srt) {
-		// Write raw sentence to file
+		obs_log(gf->log_level, "Saving sentence '%s' to file %s", sentence.c_str(),
+			gf->output_file_path.c_str());
+		// Write raw sentence to text file (non-srt format)
 		try {
 			std::ofstream output_file(file_path, openmode);
 			output_file << sentence << std::endl;
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index f96c7d9..2134e1c 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -87,6 +87,7 @@ struct transcription_filter_data {
 	bool partial_transcription = false;
 	int partial_latency = 1000;
 	float duration_filter_threshold = 2.25f;
+	// Duration of the target segment buffer in ms
 	int segment_duration = 7000;
 
 	// Cloud translation options
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index 7b9c2f4..cce2666 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -8,6 +8,7 @@
 #include "transcription-filter-utils.h"
 #include "whisper-utils/whisper-language.h"
 #include "whisper-utils/vad-processing.h"
+#include "whisper-utils/whisper-params.h"
 #include "model-utils/model-downloader-types.h"
 #include "translation/language_codes.h"
 #include "ui/filter-replace-dialog.h"
@@ -15,6 +16,7 @@
 
 #include <string>
 #include <vector>
+#include "whisper-utils/whisper-utils.h"
 
 bool translation_options_callback(obs_properties_t *props, obs_property_t *property,
 				  obs_data_t *settings)
@@ -479,6 +481,7 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_
 	obs_property_t *vad_mode_list =
 		obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"),
 					OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(vad_mode_list, MT_("No_VAD"), VAD_MODE_DISABLED);
 	obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE);
 	obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID);
 	// add vad threshold slider
@@ -528,82 +531,6 @@ void add_logging_group_properties(obs_properties_t *ppts)
 	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
 }
 
-void add_whisper_params_group_properties(obs_properties_t *ppts)
-{
-	obs_properties_t *whisper_params_group = obs_properties_create();
-	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
-				 OBS_GROUP_NORMAL, whisper_params_group);
-
-	obs_property_t *whisper_sampling_method_list = obs_properties_add_list(
-		whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"),
-		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
-	obs_property_list_add_int(whisper_sampling_method_list, "Beam search",
-				  WHISPER_SAMPLING_BEAM_SEARCH);
-	obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY);
-
-	// add int slider for context sentences
-	obs_properties_add_int_slider(whisper_params_group, "n_context_sentences",
-				      MT_("n_context_sentences"), 0, 5, 1);
-
-	// int n_threads;
-	obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1);
-	// int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
-	obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", MT_("n_max_text_ctx"),
-				      0, 16384, 100);
-	// int offset_ms;          // start offset in ms
-	// int duration_ms;        // audio duration to process in ms
-	// bool translate;
-	obs_properties_add_bool(whisper_params_group, "whisper_translate",
-				MT_("whisper_translate"));
-	// bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
-	obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context"));
-	// bool single_segment;    // force single segment output (useful for streaming)
-	obs_properties_add_bool(whisper_params_group, "single_segment", MT_("single_segment"));
-	// bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-	obs_properties_add_bool(whisper_params_group, "print_special", MT_("print_special"));
-	// bool print_progress;    // print progress information
-	obs_properties_add_bool(whisper_params_group, "print_progress", MT_("print_progress"));
-	// bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-	obs_properties_add_bool(whisper_params_group, "print_realtime", MT_("print_realtime"));
-	// bool print_timestamps;  // print timestamps for each text segment when printing realtime
-	obs_properties_add_bool(whisper_params_group, "print_timestamps", MT_("print_timestamps"));
-	// bool  token_timestamps; // enable token-level timestamps
-	obs_properties_add_bool(whisper_params_group, "token_timestamps", MT_("token_timestamps"));
-	// enable DTW timestamps
-	obs_properties_add_bool(whisper_params_group, "dtw_token_timestamps",
-				MT_("dtw_token_timestamps"));
-	// float thold_pt;         // timestamp token probability threshold (~0.01)
-	obs_properties_add_float_slider(whisper_params_group, "thold_pt", MT_("thold_pt"), 0.0f,
-					1.0f, 0.05f);
-	// float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
-	obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", MT_("thold_ptsum"),
-					0.0f, 1.0f, 0.05f);
-	// int   max_len;          // max segment length in characters
-	obs_properties_add_int_slider(whisper_params_group, "max_len", MT_("max_len"), 0, 100, 1);
-	// bool  split_on_word;    // split on word rather than on token (when used with max_len)
-	obs_properties_add_bool(whisper_params_group, "split_on_word", MT_("split_on_word"));
-	// int   max_tokens;       // max tokens per segment (0 = no limit)
-	obs_properties_add_int_slider(whisper_params_group, "max_tokens", MT_("max_tokens"), 0, 100,
-				      1);
-	// const char * initial_prompt;
-	obs_properties_add_text(whisper_params_group, "initial_prompt", MT_("initial_prompt"),
-				OBS_TEXT_DEFAULT);
-	// bool suppress_blank
-	obs_properties_add_bool(whisper_params_group, "suppress_blank", MT_("suppress_blank"));
-	// bool suppress_non_speech_tokens
-	obs_properties_add_bool(whisper_params_group, "suppress_non_speech_tokens",
-				MT_("suppress_non_speech_tokens"));
-	// float temperature
-	obs_properties_add_float_slider(whisper_params_group, "temperature", MT_("temperature"),
-					0.0f, 1.0f, 0.05f);
-	// float max_initial_ts
-	obs_properties_add_float_slider(whisper_params_group, "max_initial_ts",
-					MT_("max_initial_ts"), 0.0f, 1.0f, 0.05f);
-	// float length_penalty
-	obs_properties_add_float_slider(whisper_params_group, "length_penalty",
-					MT_("length_penalty"), -1.0f, 1.0f, 0.1f);
-}
-
 void add_general_group_properties(obs_properties_t *ppts)
 {
 	// add "General" group
@@ -742,28 +669,5 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_string(s, "translate_cloud_region", "eastus");
 
 	// Whisper parameters
-	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
-	obs_data_set_default_int(s, "n_context_sentences", 0);
-	obs_data_set_default_string(s, "initial_prompt", "");
-	obs_data_set_default_int(s, "n_threads", 4);
-	obs_data_set_default_int(s, "n_max_text_ctx", 16384);
-	obs_data_set_default_bool(s, "whisper_translate", false);
-	obs_data_set_default_bool(s, "no_context", true);
-	obs_data_set_default_bool(s, "single_segment", true);
-	obs_data_set_default_bool(s, "print_special", false);
-	obs_data_set_default_bool(s, "print_progress", false);
-	obs_data_set_default_bool(s, "print_realtime", false);
-	obs_data_set_default_bool(s, "print_timestamps", false);
-	obs_data_set_default_bool(s, "token_timestamps", false);
-	obs_data_set_default_bool(s, "dtw_token_timestamps", false);
-	obs_data_set_default_double(s, "thold_pt", 0.01);
-	obs_data_set_default_double(s, "thold_ptsum", 0.01);
-	obs_data_set_default_int(s, "max_len", 0);
-	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 50);
-	obs_data_set_default_bool(s, "suppress_blank", false);
-	obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
-	obs_data_set_default_double(s, "temperature", 0.1);
-	obs_data_set_default_double(s, "max_initial_ts", 1.0);
-	obs_data_set_default_double(s, "length_penalty", -1.0);
+	apply_whisper_params_defaults_on_settings(s);
 }
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 5e13d52..ebd765d 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -25,6 +25,7 @@
 #include "whisper-utils/whisper-language.h"
 #include "whisper-utils/whisper-model-utils.h"
 #include "whisper-utils/whisper-utils.h"
+#include "whisper-utils/whisper-params.h"
 #include "translation/language_codes.h"
 #include "translation/translation-utils.h"
 #include "translation/translation.h"
@@ -364,51 +365,27 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		gf->sentence_psum_accept_thresh =
 			(float)obs_data_get_double(s, "sentence_psum_accept_thresh");
 
-		gf->whisper_params = whisper_full_default_params(
-			(whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method"));
-		gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec");
+		apply_whisper_params_from_settings(gf->whisper_params, s);
+
 		if (!new_translate || gf->translation_model_index != "whisper-based-translation") {
 			const char *whisper_language_select =
 				obs_data_get_string(s, "whisper_language_select");
-			gf->whisper_params.language = (whisper_language_select != nullptr &&
-						       strlen(whisper_language_select) > 0)
-							      ? whisper_language_select
-							      : "auto";
+			const bool language_selected = whisper_language_select != nullptr &&
+						       strlen(whisper_language_select) > 0;
+			gf->whisper_params.language = (language_selected) ? whisper_language_select
+									  : "auto";
+			gf->whisper_params.detect_language = !language_selected;
 		} else {
 			// take the language from gf->target_lang
 			if (language_codes_to_whisper.count(gf->target_lang) > 0) {
 				gf->whisper_params.language =
 					language_codes_to_whisper[gf->target_lang].c_str();
+				gf->whisper_params.detect_language = false;
 			} else {
 				gf->whisper_params.language = "auto";
+				gf->whisper_params.detect_language = true;
 			}
 		}
-		gf->whisper_params.initial_prompt =
-			obs_data_get_string(s, "initial_prompt") != nullptr
-				? obs_data_get_string(s, "initial_prompt")
-				: "";
-		gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
-		gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
-		gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate");
-		gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
-		gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment");
-		gf->whisper_params.print_special = obs_data_get_bool(s, "print_special");
-		gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress");
-		gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime");
-		gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps");
-		gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
-		gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
-		gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
-		gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
-		gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word");
-		gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
-		gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
-		gf->whisper_params.suppress_non_speech_tokens =
-			obs_data_get_bool(s, "suppress_non_speech_tokens");
-		gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
-		gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
-		gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
-		gf->whisper_params.no_timestamps = true;
 
 		if (gf->vad) {
 			const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp
index 493c89c..a414931 100644
--- a/src/whisper-utils/vad-processing.cpp
+++ b/src/whisper-utils/vad-processing.cpp
@@ -10,6 +10,17 @@
 #include <Windows.h>
 #endif
 
+/**
+ * @brief Extracts audio data from the buffer, resamples it, and updates timestamp offsets.
+ *
+ * This function extracts audio data from the input buffer, resamples it to 16kHz, and updates
+ * gf->resampled_buffer with the resampled data.
+ *
+ * @param gf Pointer to the transcription filter data structure.
+ * @param start_timestamp_offset_ns Reference to the start timestamp offset in nanoseconds.
+ * @param end_timestamp_offset_ns Reference to the end timestamp offset in nanoseconds.
+ * @return Returns 0 on success, 1 if the input buffer is empty.
+ */
 int get_data_from_buf_and_resample(transcription_filter_data *gf,
 				   uint64_t &start_timestamp_offset_ns,
 				   uint64_t &end_timestamp_offset_ns)
@@ -111,6 +122,105 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 	return 0;
 }
 
+/**
+ * @brief Segments the audio stream by duration only, without voice activity detection.
+ *
+ * New audio is pulled through get_data_from_buf_and_resample() and appended to
+ * gf->whisper_buffer. While the buffer holds less than gf->segment_duration ms of audio the
+ * segment is partial: once more than gf->partial_latency ms passed since the last partial
+ * segment end, the buffer is sent to inference as VAD_STATE_PARTIAL (only when
+ * gf->partial_transcription is set). Once the buffer reaches gf->segment_duration it is sent to
+ * inference as VAD_STATE_WAS_OFF and the next segment starts at the current end timestamp.
+ * If no new input is available but the whisper buffer is not empty, the buffered audio is
+ * sent to inference as VAD_STATE_WAS_OFF using the timestamps of last_vad_state.
+ *
+ * @param gf Pointer to the transcription filter data structure.
+ * @param last_vad_state State returned by the previous segmentation call.
+ * @return The updated state, or last_vad_state unchanged when there was no new input.
+ */
+vad_state vad_disabled_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
+{
+	// get data from buffer and resample
+	uint64_t start_timestamp_offset_ns = 0;
+	uint64_t end_timestamp_offset_ns = 0;
+
+	const int ret = get_data_from_buf_and_resample(gf, start_timestamp_offset_ns,
+						       end_timestamp_offset_ns);
+	if (ret != 0) {
+		// if there's data on the whisper buffer - run inference as "final" segment
+		if (gf->whisper_buffer.size > 0) {
+			// log sizes/timestamps as "%llu" with a cast: "%lu"/"%d" do not match
+			// size_t/uint64_t on every platform (unsigned long is 32-bit on Win64)
+			obs_log(gf->log_level,
+				"VAD disabled: no new input but whisper buffer has %llu bytes, run inference",
+				(unsigned long long)gf->whisper_buffer.size);
+			run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
+						    last_vad_state.end_ts_offset_ms,
+						    VAD_STATE_WAS_OFF);
+		}
+		return last_vad_state;
+	}
+
+	// push the data into gf->whisper_buffer
+	circlebuf_push_back(&gf->whisper_buffer, gf->resampled_buffer.data,
+			    gf->resampled_buffer.size);
+	// clear the resampled buffer
+	circlebuf_pop_front(&gf->resampled_buffer, nullptr, gf->resampled_buffer.size);
+
+	const uint64_t whisper_buf_samples = gf->whisper_buffer.size / sizeof(float);
+	const bool is_partial_segment =
+		whisper_buf_samples < (uint64_t)(gf->segment_duration * WHISPER_SAMPLE_RATE / 1000);
+
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
+	obs_log(gf->log_level,
+		"VAD disabled: total %llu frames (%llu bytes) in whisper buffer, state was %s new state is %s",
+		(unsigned long long)whisper_buf_samples,
+		(unsigned long long)gf->whisper_buffer.size, last_vad_state.vad_on ? "ON" : "OFF",
+		is_partial_segment ? "PARTIAL" : "OFF");
+#endif
+
+	const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;
+
+	if (is_partial_segment) {
+		// check if we need to send the partial segment to inference based on
+		// the last partial segment end timestamp
+		const uint64_t unprocessed_length_ms =
+			end_ts_offset_ms - last_vad_state.last_partial_segment_end_ts;
+		if (unprocessed_length_ms > (uint64_t)gf->partial_latency) {
+			if (gf->partial_transcription) {
+				obs_log(gf->log_level,
+					"VAD disabled: partial segment with %llu ms unprocessed audio. start %llu, end %llu",
+					(unsigned long long)unprocessed_length_ms,
+					(unsigned long long)last_vad_state.start_ts_offest_ms,
+					(unsigned long long)end_ts_offset_ms);
+				// Send to inference
+				run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
+							    end_ts_offset_ms, VAD_STATE_PARTIAL);
+			} else {
+				obs_log(gf->log_level,
+					"VAD disabled: partial segment with %llu ms unprocessed audio. start %llu, end %llu. Skipping.",
+					(unsigned long long)unprocessed_length_ms,
+					(unsigned long long)last_vad_state.start_ts_offest_ms,
+					(unsigned long long)end_ts_offset_ms);
+			}
+			// update the last partial segment end timestamp
+			last_vad_state.last_partial_segment_end_ts = end_ts_offset_ms;
+		}
+
+		return {false, last_vad_state.start_ts_offest_ms, end_ts_offset_ms,
+			last_vad_state.last_partial_segment_end_ts};
+	} else {
+		obs_log(gf->log_level,
+			"VAD disabled: full segment end -> send to inference. start %llu, end %llu",
+			(unsigned long long)last_vad_state.start_ts_offest_ms,
+			(unsigned long long)end_ts_offset_ms);
+		// send the entire buffer to inference
+		run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, end_ts_offset_ms,
+					    VAD_STATE_WAS_OFF);
+		return {false, end_ts_offset_ms, end_ts_offset_ms, end_ts_offset_ms};
+	}
+}
+
 vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 {
 	// get data from buffer and resample
diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h
index 996002b..16a5992 100644
--- a/src/whisper-utils/vad-processing.h
+++ b/src/whisper-utils/vad-processing.h
@@ -1,9 +1,48 @@
 #ifndef VAD_PROCESSING_H
 #define VAD_PROCESSING_H
 
+/**
+ * @file vad-processing.h
+ * @brief Header file for Voice Activity Detection (VAD) processing utilities.
+ *
+ * This file contains the declarations of enums, structs, and functions used for
+ * VAD processing in the transcription filter.
+ */
+
+/**
+ * @enum VadState
+ * @brief Enumeration of possible VAD states.
+ *
+ * - VAD_STATE_WAS_ON: VAD was previously on.
+ * - VAD_STATE_WAS_OFF: VAD was previously off.
+ * - VAD_STATE_IS_OFF: VAD is currently off.
+ * - VAD_STATE_PARTIAL: VAD is in a partial state.
+ */
 enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL };
+
+/**
+ * @enum VadMode
+ * @brief Enumeration of possible VAD modes.
+ *
+ * - VAD_MODE_ACTIVE: VAD is actively processing.
+ * - VAD_MODE_HYBRID: VAD is in hybrid mode.
+ * - VAD_MODE_DISABLED: VAD is disabled.
+ */
 enum VadMode { VAD_MODE_ACTIVE = 0, VAD_MODE_HYBRID, VAD_MODE_DISABLED };
 
+/**
+ * @struct vad_state
+ * @brief Structure representing the state of VAD.
+ *
+ * @var vad_state::vad_on
+ * Indicates whether VAD is currently on.
+ * @var vad_state::start_ts_offest_ms
+ * Timestamp offset in milliseconds when VAD started.
+ * @var vad_state::end_ts_offset_ms
+ * Timestamp offset in milliseconds when VAD ended.
+ * @var vad_state::last_partial_segment_end_ts
+ * Timestamp of the end of the last partial segment.
+ */
 struct vad_state {
 	bool vad_on;
 	uint64_t start_ts_offest_ms;
@@ -11,6 +50,7 @@ struct vad_state {
 	uint64_t last_partial_segment_end_ts;
 };
 
+vad_state vad_disabled_segmentation(transcription_filter_data *gf, vad_state last_vad_state);
 vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state);
 vad_state hybrid_vad_segmentation(transcription_filter_data *gf, vad_state last_vad_state);
 void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file);
diff --git a/src/whisper-utils/whisper-params.cpp b/src/whisper-utils/whisper-params.cpp
new file mode 100644
index 0000000..883e2af
--- /dev/null
+++ b/src/whisper-utils/whisper-params.cpp
@@ -0,0 +1,192 @@
+#include "whisper-params.h"
+
+#include <obs-module.h>
+
+#define MT_ obs_module_text
+
+void whisper_params_pretty_print(whisper_full_params &params)
+{
+	// Logs every field at LOG_INFO; a null C string is shown as "(null)" (null + "%s" is UB).
+	const auto tf = [](bool value) { return value ? "true" : "false"; };
+	const auto text = [](const char *str) { return str != nullptr ? str : "(null)"; };
+	const bool beam_search = params.strategy == WHISPER_SAMPLING_BEAM_SEARCH;
+	obs_log(LOG_INFO, "Whisper params:");
+	obs_log(LOG_INFO, "strategy: %s", beam_search ? "beam_search" : "greedy");
+	obs_log(LOG_INFO, "n_threads: %d", params.n_threads);
+	obs_log(LOG_INFO, "n_max_text_ctx: %d", params.n_max_text_ctx);
+	obs_log(LOG_INFO, "offset_ms: %d", params.offset_ms);
+	obs_log(LOG_INFO, "duration_ms: %d", params.duration_ms);
+	obs_log(LOG_INFO, "translate: %s", tf(params.translate));
+	obs_log(LOG_INFO, "no_context: %s", tf(params.no_context));
+	obs_log(LOG_INFO, "no_timestamps: %s", tf(params.no_timestamps));
+	obs_log(LOG_INFO, "single_segment: %s", tf(params.single_segment));
+	obs_log(LOG_INFO, "print_special: %s", tf(params.print_special));
+	obs_log(LOG_INFO, "print_progress: %s", tf(params.print_progress));
+	obs_log(LOG_INFO, "print_realtime: %s", tf(params.print_realtime));
+	obs_log(LOG_INFO, "print_timestamps: %s", tf(params.print_timestamps));
+	obs_log(LOG_INFO, "token_timestamps: %s", tf(params.token_timestamps));
+	obs_log(LOG_INFO, "thold_pt: %f", params.thold_pt);
+	obs_log(LOG_INFO, "thold_ptsum: %f", params.thold_ptsum);
+	obs_log(LOG_INFO, "max_len: %d", params.max_len);
+	obs_log(LOG_INFO, "split_on_word: %s", tf(params.split_on_word));
+	obs_log(LOG_INFO, "max_tokens: %d", params.max_tokens);
+	obs_log(LOG_INFO, "debug_mode: %s", tf(params.debug_mode));
+	obs_log(LOG_INFO, "audio_ctx: %d", params.audio_ctx);
+	obs_log(LOG_INFO, "tdrz_enable: %s", tf(params.tdrz_enable));
+	obs_log(LOG_INFO, "suppress_regex: %s", text(params.suppress_regex));
+	obs_log(LOG_INFO, "initial_prompt: %s", text(params.initial_prompt));
+	obs_log(LOG_INFO, "language: %s", text(params.language));
+	obs_log(LOG_INFO, "detect_language: %s", tf(params.detect_language));
+	obs_log(LOG_INFO, "suppress_blank: %s", tf(params.suppress_blank));
+	obs_log(LOG_INFO, "suppress_non_speech_tokens: %s", tf(params.suppress_non_speech_tokens));
+	obs_log(LOG_INFO, "temperature: %f", params.temperature);
+	obs_log(LOG_INFO, "max_initial_ts: %f", params.max_initial_ts);
+	obs_log(LOG_INFO, "length_penalty: %f", params.length_penalty);
+	obs_log(LOG_INFO, "temperature_inc: %f", params.temperature_inc);
+	obs_log(LOG_INFO, "entropy_thold: %f", params.entropy_thold);
+	obs_log(LOG_INFO, "logprob_thold: %f", params.logprob_thold);
+	obs_log(LOG_INFO, "no_speech_thold: %f", params.no_speech_thold);
+	obs_log(LOG_INFO, "greedy.best_of: %d", params.greedy.best_of);
+	obs_log(LOG_INFO, "beam_search.beam_size: %d", params.beam_search.beam_size);
+	obs_log(LOG_INFO, "beam_search.patience: %f", params.beam_search.patience);
+}
+
+void apply_whisper_params_defaults_on_settings(obs_data_t *s)
+{
+	// Registers a default for each setting, using whisper.cpp's beam-search defaults unless a
+	// literal value is written out below.
+	const whisper_sampling_strategy default_strategy = WHISPER_SAMPLING_BEAM_SEARCH;
+	const whisper_full_params defaults = whisper_full_default_params(default_strategy);
+
+	obs_data_set_default_int(s, "strategy", default_strategy);
+	obs_data_set_default_int(s, "n_threads", defaults.n_threads);
+	obs_data_set_default_int(s, "n_max_text_ctx", defaults.n_max_text_ctx);
+	obs_data_set_default_int(s, "offset_ms", defaults.offset_ms);
+	obs_data_set_default_int(s, "duration_ms", defaults.duration_ms);
+	obs_data_set_default_bool(s, "whisper_translate", defaults.translate);
+	obs_data_set_default_bool(s, "no_context", defaults.no_context);
+	obs_data_set_default_bool(s, "no_timestamps", defaults.no_timestamps);
+	obs_data_set_default_bool(s, "single_segment", defaults.single_segment);
+	// The print_* switches are registered as off regardless of the library defaults.
+	obs_data_set_default_bool(s, "print_special", false);
+	obs_data_set_default_bool(s, "print_progress", false);
+	obs_data_set_default_bool(s, "print_realtime", false);
+	obs_data_set_default_bool(s, "print_timestamps", false);
+	obs_data_set_default_bool(s, "token_timestamps", defaults.token_timestamps);
+	obs_data_set_default_double(s, "thold_pt", defaults.thold_pt);
+	obs_data_set_default_double(s, "thold_ptsum", defaults.thold_ptsum);
+	obs_data_set_default_int(s, "max_len", defaults.max_len);
+	obs_data_set_default_bool(s, "split_on_word", defaults.split_on_word);
+	obs_data_set_default_int(s, "max_tokens", defaults.max_tokens);
+	obs_data_set_default_bool(s, "debug_mode", defaults.debug_mode);
+	obs_data_set_default_int(s, "audio_ctx", defaults.audio_ctx);
+	obs_data_set_default_bool(s, "tdrz_enable", defaults.tdrz_enable);
+	obs_data_set_default_string(s, "suppress_regex", defaults.suppress_regex);
+	obs_data_set_default_string(s, "initial_prompt", defaults.initial_prompt);
+	// obs_data_set_default_string(s, "language", defaults.language);
+	obs_data_set_default_bool(s, "detect_language", defaults.detect_language);
+	obs_data_set_default_bool(s, "suppress_blank", false);
+	obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
+	obs_data_set_default_double(s, "temperature", defaults.temperature);
+	obs_data_set_default_double(s, "max_initial_ts", defaults.max_initial_ts);
+	obs_data_set_default_double(s, "length_penalty", defaults.length_penalty);
+	obs_data_set_default_double(s, "temperature_inc", defaults.temperature_inc);
+	obs_data_set_default_double(s, "entropy_thold", defaults.entropy_thold);
+	obs_data_set_default_double(s, "logprob_thold", defaults.logprob_thold);
+	obs_data_set_default_double(s, "no_speech_thold", defaults.no_speech_thold);
+	obs_data_set_default_int(s, "greedy.best_of", defaults.greedy.best_of);
+	obs_data_set_default_int(s, "beam_search.beam_size", defaults.beam_search.beam_size);
+	obs_data_set_default_double(s, "beam_search.patience", defaults.beam_search.patience);
+}
+
+void apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *settings)
+{
+	const auto strategy = (whisper_sampling_strategy)obs_data_get_int(settings, "strategy");
+	params = whisper_full_default_params(strategy); // start from whisper's defaults
+	params.n_threads = (int)obs_data_get_int(settings, "n_threads");
+	params.n_max_text_ctx = (int)obs_data_get_int(settings, "n_max_text_ctx");
+	params.offset_ms = (int)obs_data_get_int(settings, "offset_ms");
+	params.duration_ms = (int)obs_data_get_int(settings, "duration_ms");
+	params.translate = obs_data_get_bool(settings, "whisper_translate");
+	params.no_context = obs_data_get_bool(settings, "no_context");
+	params.no_timestamps = obs_data_get_bool(settings, "no_timestamps");
+	params.single_segment = obs_data_get_bool(settings, "single_segment");
+	params.print_special = obs_data_get_bool(settings, "print_special");
+	params.print_progress = obs_data_get_bool(settings, "print_progress");
+	params.print_realtime = obs_data_get_bool(settings, "print_realtime");
+	params.print_timestamps = obs_data_get_bool(settings, "print_timestamps");
+	params.token_timestamps = obs_data_get_bool(settings, "token_timestamps");
+	params.thold_pt = (float)obs_data_get_double(settings, "thold_pt");
+	params.thold_ptsum = (float)obs_data_get_double(settings, "thold_ptsum");
+	params.max_len = (int)obs_data_get_int(settings, "max_len");
+	params.split_on_word = obs_data_get_bool(settings, "split_on_word");
+	params.max_tokens = (int)obs_data_get_int(settings, "max_tokens");
+	params.debug_mode = obs_data_get_bool(settings, "debug_mode");
+	params.audio_ctx = (int)obs_data_get_int(settings, "audio_ctx");
+	params.tdrz_enable = obs_data_get_bool(settings, "tdrz_enable");
+	const char *regex = obs_data_get_string(settings, "suppress_regex");
+	params.suppress_regex = (regex && *regex) ? regex : nullptr; // empty text => no regex filter
+	params.initial_prompt = obs_data_get_string(settings, "initial_prompt");
+	params.detect_language = obs_data_get_bool(settings, "detect_language");
+	params.suppress_blank = obs_data_get_bool(settings, "suppress_blank");
+	params.suppress_non_speech_tokens =
+		obs_data_get_bool(settings, "suppress_non_speech_tokens");
+	params.temperature = (float)obs_data_get_double(settings, "temperature");
+	params.max_initial_ts = (float)obs_data_get_double(settings, "max_initial_ts");
+	params.length_penalty = (float)obs_data_get_double(settings, "length_penalty");
+	params.temperature_inc = (float)obs_data_get_double(settings, "temperature_inc");
+	params.entropy_thold = (float)obs_data_get_double(settings, "entropy_thold");
+	params.logprob_thold = (float)obs_data_get_double(settings, "logprob_thold");
+	params.no_speech_thold = (float)obs_data_get_double(settings, "no_speech_thold");
+	params.greedy.best_of = (int)obs_data_get_int(settings, "greedy.best_of");
+	params.beam_search.beam_size = (int)obs_data_get_int(settings, "beam_search.beam_size");
+	params.beam_search.patience = (float)obs_data_get_double(settings, "beam_search.patience");
+}
+
+void add_whisper_params_group_properties(obs_properties_t *ppts)
+{
+	obs_properties_t *g = obs_properties_create(); // ranges must include whisper's defaults
+	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
+				 OBS_GROUP_NORMAL, g);
+	auto *strategy = obs_properties_add_list(g, "strategy", MT_("whisper_sampling_strategy"),
+						 OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(strategy, "Beam search", WHISPER_SAMPLING_BEAM_SEARCH);
+	obs_property_list_add_int(strategy, "Greedy", WHISPER_SAMPLING_GREEDY); // value = enum
+	obs_properties_add_int(g, "n_threads", MT_("n_threads"), 1, 8, 1);
+	obs_properties_add_int(g, "n_max_text_ctx", MT_("n_max_text_ctx"), 1, 16384, 1);
+	obs_properties_add_int(g, "offset_ms", MT_("offset_ms"), 0, 10000, 100);
+	obs_properties_add_int(g, "duration_ms", MT_("duration_ms"), 0, 30000, 500);
+	obs_properties_add_bool(g, "whisper_translate", MT_("whisper_translate"));
+	obs_properties_add_bool(g, "no_context", MT_("no_context"));
+	obs_properties_add_bool(g, "no_timestamps", MT_("no_timestamps"));
+	obs_properties_add_bool(g, "single_segment", MT_("single_segment"));
+	obs_properties_add_bool(g, "print_special", MT_("print_special"));
+	obs_properties_add_bool(g, "print_progress", MT_("print_progress"));
+	obs_properties_add_bool(g, "print_realtime", MT_("print_realtime"));
+	obs_properties_add_bool(g, "print_timestamps", MT_("print_timestamps"));
+	obs_properties_add_bool(g, "token_timestamps", MT_("token_timestamps"));
+	obs_properties_add_float(g, "thold_pt", MT_("thold_pt"), 0, 1, 0.05);
+	obs_properties_add_float(g, "thold_ptsum", MT_("thold_ptsum"), 0, 1, 0.05);
+	obs_properties_add_int(g, "max_len", MT_("max_len"), 0, 1000, 1);
+	obs_properties_add_bool(g, "split_on_word", MT_("split_on_word"));
+	obs_properties_add_int(g, "max_tokens", MT_("max_tokens"), 0, 1000, 1);
+	obs_properties_add_bool(g, "debug_mode", MT_("debug_mode"));
+	obs_properties_add_int(g, "audio_ctx", MT_("audio_ctx"), 0, 10, 1);
+	obs_properties_add_bool(g, "tdrz_enable", MT_("tdrz_enable"));
+	obs_properties_add_text(g, "suppress_regex", MT_("suppress_regex"), OBS_TEXT_DEFAULT);
+	obs_properties_add_text(g, "initial_prompt", MT_("initial_prompt"), OBS_TEXT_DEFAULT);
+	obs_properties_add_bool(g, "detect_language", MT_("detect_language"));
+	obs_properties_add_bool(g, "suppress_blank", MT_("suppress_blank"));
+	obs_properties_add_bool(g, "suppress_non_speech_tokens", MT_("suppress_non_speech_tokens"));
+	obs_properties_add_float(g, "temperature", MT_("temperature"), 0, 1, 0.05);
+	obs_properties_add_float(g, "max_initial_ts", MT_("max_initial_ts"), 0, 100, 1);
+	obs_properties_add_float(g, "length_penalty", MT_("length_penalty"), -1, 1, 0.05);
+	obs_properties_add_float(g, "temperature_inc", MT_("temperature_inc"), 0, 1, 0.05);
+	obs_properties_add_float(g, "entropy_thold", MT_("entropy_thold"), 0, 10, 0.05);
+	obs_properties_add_float(g, "logprob_thold", MT_("logprob_thold"), -10, 1, 0.05);
+	obs_properties_add_float(g, "no_speech_thold", MT_("no_speech_thold"), 0, 1, 0.05);
+	obs_properties_add_int(g, "greedy.best_of", MT_("greedy.best_of"), -1, 10, 1);
+	obs_properties_add_int(g, "beam_search.beam_size", MT_("beam_search.beam_size"), 1, 10, 1);
+	obs_properties_add_float(g, "beam_search.patience", MT_("beam_search.patience"), -1, 1,
+				 0.05);
+}
diff --git a/src/whisper-utils/whisper-params.h b/src/whisper-utils/whisper-params.h
new file mode 100644
index 0000000..2106b2f
--- /dev/null
+++ b/src/whisper-utils/whisper-params.h
@@ -0,0 +1,52 @@
+#ifndef WHISPER_PARAMS_H
+#define WHISPER_PARAMS_H
+
+#include "transcription-filter-data.h"
+
+/**
+ * @brief Prints the whisper parameters in a human-readable format.
+ *
+ * This function writes each whisper parameter through obs_log at LOG_INFO level,
+ * one "name: value" line per parameter.
+ *
+ * @param params Reference to the whisper_full_params structure.
+ */
+void whisper_params_pretty_print(whisper_full_params &params);
+
+/**
+ * @brief Applies default whisper parameters to the given settings.
+ *
+ * This function sets the default values for whisper parameters on the provided
+ * OBS data settings object. It ensures that all necessary parameters have
+ * their default values, which can be used as a baseline for further
+ * customization.
+ *
+ * @param s A pointer to an obs_data_t structure representing the settings
+ *          where the default whisper parameters will be applied.
+ */
+void apply_whisper_params_defaults_on_settings(obs_data_t *s);
+
+/**
+ * @brief Applies whisper parameters from the given settings.
+ *
+ * This function resets `params` to whisper's defaults for the "strategy" setting and then
+ * overwrites each exposed field with the value stored in the `obs_data_t` settings object.
+ *
+ * @param params A reference to the `whisper_full_params` structure that will be modified.
+ * @param settings A pointer to the `obs_data_t` settings object containing the parameters to apply.
+ */
+void apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *settings);
+
+/**
+ * @brief Adds whisper parameters group properties to the given OBS properties object.
+ *
+ * This function adds a group of properties related to whisper parameters to the
+ * specified OBS properties object. These properties can be used to configure
+ * whisper-related settings in the OBS application.
+ *
+ * @param ppts A pointer to an OBS properties object where the whisper parameters
+ *             group properties will be added.
+ */
+void add_whisper_params_group_properties(obs_properties_t *ppts);
+
+#endif // WHISPER_PARAMS_H
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 3518edf..b53c5d4 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -198,10 +198,23 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 		obs_log(gf->log_level, "Initial prompt: %s", gf->whisper_params.initial_prompt);
 	}
 
+	obs_log(gf->log_level, "Running whisper inference. single segment? %s",
+		gf->whisper_params.single_segment ? "yes" : "no");
+
 	// run the inference
 	int whisper_full_result = -1;
 	gf->whisper_params.duration_ms = (int)(whisper_duration_ms);
 	try {
+		// whisper_full_params whisper_params_tmp = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH);
+		// whisper_params_tmp.language = gf->whisper_params.language;
+		// gf->whisper_params.no_timestamps = false;
+		// whisper_params_tmp.print_progress = false;
+		// whisper_params_tmp.print_timestamps = false;
+		// whisper_params_tmp.split_on_word = true;
+		// whisper_params_tmp.max_tokens = 100;
+		// whisper_params_tmp.suppress_blank = false;
+		// whisper_params_pretty_print(gf->whisper_params);
+		// whisper_params_pretty_print(whisper_params_tmp);
 		whisper_full_result = whisper_full(gf->whisper_context, gf->whisper_params,
 						   pcm32f_data, (int)pcm32f_size);
 	} catch (const std::exception &e) {
@@ -355,7 +368,7 @@ void whisper_loop(void *data)
 
 	obs_log(gf->log_level, "Starting whisper thread");
 
-	vad_state current_vad_state = {false, now_ms(), 0, 0};
+	vad_state current_vad_state = {false, 0, 0, 0};
 
 	const char *whisper_loop_name = "Whisper loop";
 	profile_register_root(whisper_loop_name, 50 * 1000 * 1000);
@@ -377,6 +390,8 @@ void whisper_loop(void *data)
 			current_vad_state = hybrid_vad_segmentation(gf, current_vad_state);
 		} else if (gf->vad_mode == VAD_MODE_ACTIVE) {
 			current_vad_state = vad_based_segmentation(gf, current_vad_state);
+		} else if (gf->vad_mode == VAD_MODE_DISABLED) {
+			current_vad_state = vad_disabled_segmentation(gf, current_vad_state);
 		}
 
 		if (!gf->cleared_last_sub) {
@@ -399,7 +414,7 @@ void whisper_loop(void *data)
 		// or if the whisper context is null
 		std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
 		if (gf->input_buffers->size == 0) {
-			gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
+			gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(250));
 		}
 	}
 
diff --git a/src/whisper-utils/whisper-utils.h b/src/whisper-utils/whisper-utils.h
index c62168b..b55a1ae 100644
--- a/src/whisper-utils/whisper-utils.h
+++ b/src/whisper-utils/whisper-utils.h
@@ -1,25 +1,81 @@
+/**
+ * @file whisper-utils.h
+ * @brief Utility functions for handling whisper transcription operations.
+ *
+ * This header file contains declarations for various utility functions used
+ * in the whisper transcription process, including thread management, sequence
+ * operations, and timestamp formatting.
+ *
+ * @note The timestamp conversion function is adapted from the whisper.cpp project.
+ *
+ * @see transcription-filter-data.h
+ */
 #ifndef WHISPER_UTILS_H
 #define WHISPER_UTILS_H
 
 #include "transcription-filter-data.h"
 
 #include <string>
+#include <vector>
 
+/**
+ * @brief Shuts down the whisper thread.
+ *
+ * This function terminates the whisper thread associated with the given
+ * transcription filter data.
+ *
+ * @param gf Pointer to the transcription filter data structure.
+ */
 void shutdown_whisper_thread(struct transcription_filter_data *gf);
+
+/**
+ * @brief Starts the whisper thread with a specified path.
+ *
+ * This function initializes and starts the whisper thread using the provided
+ * transcription filter data, path, and Silero VAD model file.
+ *
+ * @param gf Pointer to the transcription filter data structure.
+ * @param path Reference to a string containing the path.
+ * @param silero_vad_model_file Pointer to a character array containing the Silero VAD model file.
+ */
 void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path,
 				    const char *silero_vad_model_file);
 
+/**
+ * @brief Finds the start of overlap between two sequences.
+ *
+ * This function compares two sequences of whisper token data and determines
+ * the starting indices of their overlap.
+ *
+ * @param seq1 Reference to the first sequence of whisper token data.
+ * @param seq2 Reference to the second sequence of whisper token data.
+ * @return std::pair<int, int> A pair of integers representing the starting indices of the overlap in seq1 and seq2.
+ */
 std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &seq1,
 				       const std::vector<whisper_token_data> &seq2);
+
+/**
+ * @brief Reconstructs a sentence from two sequences.
+ *
+ * This function merges two sequences of whisper token data to reconstruct a
+ * complete sentence.
+ *
+ * @param seq1 Reference to the first sequence of whisper token data.
+ * @param seq2 Reference to the second sequence of whisper token data.
+ * @return std::vector<whisper_token_data> A vector containing the reconstructed sentence.
+ */
 std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1,
 						    const std::vector<whisper_token_data> &seq2);
 
 /**
- * @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss" .
- * Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
- * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream)
- * @return std::string Timestamp in the format "MM:SS.sss"
+ * @brief Converts a timestamp in milliseconds to a string in the format "MM:SS.sss".
+ *
+ * This function takes a timestamp in milliseconds and converts it to a string
+ * representation in the format "MM:SS.sss".
+ *
+ * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream).
+ * @return std::string Timestamp in the format "MM:SS.sss".
  */
 std::string to_timestamp(uint64_t t_ms_offset);
 
-#endif /* WHISPER_UTILS_H */
+#endif // WHISPER_UTILS_H