From a32e3274777ac0bf00c4679e0d351657569e45ba Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 24 Nov 2024 20:28:46 -0800 Subject: [PATCH] No VAD segmentation option (#182) * Add support for disabled VAD mode and enhance CMake configuration * Enhance VAD processing and transcription filter data structure with additional comments and logic adjustments * Enhance VAD processing with improved logging, adjust single segment default, and update inference handling * Refactor whisper parameter handling and enhance utility functions for better clarity and maintainability * Add whisper parameters group properties and clean up related code * Add whisper parameters handling and update related files for improved functionality * Refactor whisper parameter type casting for improved clarity and consistency * trigger build * Fix logging message to use the correct variable for saved sentence --- CMakeLists.txt | 8 +- data/locale/en-US.ini | 61 ++++--- src/transcription-filter-callbacks.cpp | 4 +- src/transcription-filter-data.h | 1 + src/transcription-filter-properties.cpp | 104 +----------- src/transcription-filter.cpp | 43 ++--- src/whisper-utils/vad-processing.cpp | 88 +++++++++++ src/whisper-utils/vad-processing.h | 40 +++++ src/whisper-utils/whisper-params.cpp | 192 +++++++++++++++++++++++ src/whisper-utils/whisper-params.h | 52 ++++++ src/whisper-utils/whisper-processing.cpp | 19 ++- src/whisper-utils/whisper-utils.h | 66 +++++++- 12 files changed, 515 insertions(+), 163 deletions(-) create mode 100644 src/whisper-utils/whisper-params.cpp create mode 100644 src/whisper-utils/whisper-params.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c18ca66..4ccb91d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,7 +59,7 @@ if(WIN32) "cpu" CACHE STRING "Acceleration to use") endif() - set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda") + set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda" "vulkan") endif() 
include(cmake/BuildWhispercpp.cmake) @@ -101,6 +101,11 @@ include(cmake/BuildICU.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) +# check env var for extra verbose logging +if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE}) + target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE) +endif() + target_sources( ${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c @@ -117,6 +122,7 @@ target_sources( src/whisper-utils/whisper-processing.cpp src/whisper-utils/whisper-utils.cpp src/whisper-utils/whisper-model-utils.cpp + src/whisper-utils/whisper-params.cpp src/whisper-utils/silero-vad-onnx.cpp src/whisper-utils/token-buffer-thread.cpp src/whisper-utils/vad-processing.cpp diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 3f94d4e..0d326fb 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -13,30 +13,9 @@ external_model_file="External model file" whisper_parameters="Whisper Model Parameters" language="Input Language" whisper_sampling_method="Whisper Sampling Method" -n_threads="Number of threads" -n_max_text_ctx="Max text context" -translate="Translate" translate_local="Local Translation" translate_cloud="Cloud Translation" -no_context="No context" -single_segment="Single segment" -print_special="Print special" -print_progress="Print progress" -print_realtime="Print realtime" -print_timestamps="Print timestamps" -token_timestamps="Token timestamps" -thold_pt="Token prob. threshold" -thold_ptsum="Token sum prob. 
threshold" -max_len="Max length in chars" -split_on_word="Split on word" -max_tokens="Max tokens" speed_up="Speed up" -initial_prompt="Initial prompt" -suppress_blank="Suppress blank" -suppress_non_speech_tokens="Suppress non-speech tokens" -temperature="Temperature" -max_initial_ts="Max initial timestamps" -length_penalty="Length penalty" save_srt="Save in SRT format" truncate_output_file="Truncate file on new sentence" only_while_recording="Write output only while recording" @@ -92,11 +71,51 @@ partial_latency="Latency (ms)" vad_mode="VAD Mode" Active_VAD="Active VAD" Hybrid_VAD="Hybrid VAD" +No_VAD="No VAD" translate_only_full_sentences="Translate only full sentences" duration_filter_threshold="Duration filter" segment_duration="Segment duration" n_context_sentences="# Context sentences" max_sub_duration="Max. sub duration (ms)" +# Whisper model parameters +strategy="Strategy" +n_threads="Number of threads" +n_max_text_ctx="Max text context" +offset_ms="Offset (ms)" +duration_ms="Duration (ms)" +whisper_translate="Translate" +no_context="No context" +no_timestamps="No timestamps" +single_segment="Single segment" +print_special="Print special" +print_progress="Print progress" +print_realtime="Print realtime" +print_timestamps="Print timestamps" +token_timestamps="Token timestamps" +thold_pt="Token prob. threshold" +thold_ptsum="Token sum prob. 
threshold" +max_len="Max length in chars" +split_on_word="Split on word" +max_tokens="Max tokens" +debug_mode="Debug mode" +audio_ctx="Audio context" +tdrz_enable="Enable TDRZ" +suppress_regex="Suppress regex" +initial_prompt="Initial prompt" +language="Input Language" +detect_language="Detect language" +suppress_blank="Suppress blank" +suppress_non_speech_tokens="Suppress non-speech tokens" +temperature="Temperature" +max_initial_ts="Max initial timestamps" +length_penalty="Length penalty" +temperature_inc="Temperature increment" +entropy_thold="Entropy threshold" +logprob_thold="Logprob threshold" +no_speech_thold="No speech threshold" +greedy.best_of="Greedy best of" +beam_search.beam_size="Beam size" +beam_search.patience="Patience" Google-Cloud-Translation="Google Cloud Translation" Microsoft-Translator="Microsoft Azure Translator" Amazon-Translate="AWS Translate" diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 5938863..98be9c7 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -143,7 +143,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf, openmode |= std::ios::app; } if (!gf->save_srt) { - // Write raw sentence to file + obs_log(gf->log_level, "Saving sentence '%s' to file %s", sentence.c_str(), + gf->output_file_path.c_str()); + // Write raw sentence to text file (non-srt format) try { std::ofstream output_file(file_path, openmode); output_file << sentence << std::endl; diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index f96c7d9..2134e1c 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -87,6 +87,7 @@ struct transcription_filter_data { bool partial_transcription = false; int partial_latency = 1000; float duration_filter_threshold = 2.25f; + // Duration of the target segment buffer in ms int segment_duration = 7000; // Cloud translation options diff --git 
a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 7b9c2f4..cce2666 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -8,6 +8,7 @@ #include "transcription-filter-utils.h" #include "whisper-utils/whisper-language.h" #include "whisper-utils/vad-processing.h" +#include "whisper-utils/whisper-params.h" #include "model-utils/model-downloader-types.h" #include "translation/language_codes.h" #include "ui/filter-replace-dialog.h" @@ -15,6 +16,7 @@ #include #include +#include "whisper-utils/whisper-utils.h" bool translation_options_callback(obs_properties_t *props, obs_property_t *property, obs_data_t *settings) @@ -479,6 +481,7 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_ obs_property_t *vad_mode_list = obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(vad_mode_list, MT_("No_VAD"), VAD_MODE_DISABLED); obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE); obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID); // add vad threshold slider @@ -528,82 +531,6 @@ void add_logging_group_properties(obs_properties_t *ppts) obs_property_list_add_int(list, "WARNING", LOG_WARNING); } -void add_whisper_params_group_properties(obs_properties_t *ppts) -{ - obs_properties_t *whisper_params_group = obs_properties_create(); - obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), - OBS_GROUP_NORMAL, whisper_params_group); - - obs_property_t *whisper_sampling_method_list = obs_properties_add_list( - whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"), - OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); - obs_property_list_add_int(whisper_sampling_method_list, "Beam search", - WHISPER_SAMPLING_BEAM_SEARCH); - obs_property_list_add_int(whisper_sampling_method_list, "Greedy", 
WHISPER_SAMPLING_GREEDY); - - // add int slider for context sentences - obs_properties_add_int_slider(whisper_params_group, "n_context_sentences", - MT_("n_context_sentences"), 0, 5, 1); - - // int n_threads; - obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1); - // int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder - obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", MT_("n_max_text_ctx"), - 0, 16384, 100); - // int offset_ms; // start offset in ms - // int duration_ms; // audio duration to process in ms - // bool translate; - obs_properties_add_bool(whisper_params_group, "whisper_translate", - MT_("whisper_translate")); - // bool no_context; // do not use past transcription (if any) as initial prompt for the decoder - obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context")); - // bool single_segment; // force single segment output (useful for streaming) - obs_properties_add_bool(whisper_params_group, "single_segment", MT_("single_segment")); - // bool print_special; // print special tokens (e.g. , , , etc.) 
- obs_properties_add_bool(whisper_params_group, "print_special", MT_("print_special")); - // bool print_progress; // print progress information - obs_properties_add_bool(whisper_params_group, "print_progress", MT_("print_progress")); - // bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead) - obs_properties_add_bool(whisper_params_group, "print_realtime", MT_("print_realtime")); - // bool print_timestamps; // print timestamps for each text segment when printing realtime - obs_properties_add_bool(whisper_params_group, "print_timestamps", MT_("print_timestamps")); - // bool token_timestamps; // enable token-level timestamps - obs_properties_add_bool(whisper_params_group, "token_timestamps", MT_("token_timestamps")); - // enable DTW timestamps - obs_properties_add_bool(whisper_params_group, "dtw_token_timestamps", - MT_("dtw_token_timestamps")); - // float thold_pt; // timestamp token probability threshold (~0.01) - obs_properties_add_float_slider(whisper_params_group, "thold_pt", MT_("thold_pt"), 0.0f, - 1.0f, 0.05f); - // float thold_ptsum; // timestamp token sum probability threshold (~0.01) - obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", MT_("thold_ptsum"), - 0.0f, 1.0f, 0.05f); - // int max_len; // max segment length in characters - obs_properties_add_int_slider(whisper_params_group, "max_len", MT_("max_len"), 0, 100, 1); - // bool split_on_word; // split on word rather than on token (when used with max_len) - obs_properties_add_bool(whisper_params_group, "split_on_word", MT_("split_on_word")); - // int max_tokens; // max tokens per segment (0 = no limit) - obs_properties_add_int_slider(whisper_params_group, "max_tokens", MT_("max_tokens"), 0, 100, - 1); - // const char * initial_prompt; - obs_properties_add_text(whisper_params_group, "initial_prompt", MT_("initial_prompt"), - OBS_TEXT_DEFAULT); - // bool suppress_blank - obs_properties_add_bool(whisper_params_group, "suppress_blank", 
MT_("suppress_blank")); - // bool suppress_non_speech_tokens - obs_properties_add_bool(whisper_params_group, "suppress_non_speech_tokens", - MT_("suppress_non_speech_tokens")); - // float temperature - obs_properties_add_float_slider(whisper_params_group, "temperature", MT_("temperature"), - 0.0f, 1.0f, 0.05f); - // float max_initial_ts - obs_properties_add_float_slider(whisper_params_group, "max_initial_ts", - MT_("max_initial_ts"), 0.0f, 1.0f, 0.05f); - // float length_penalty - obs_properties_add_float_slider(whisper_params_group, "length_penalty", - MT_("length_penalty"), -1.0f, 1.0f, 0.1f); -} - void add_general_group_properties(obs_properties_t *ppts) { // add "General" group @@ -742,28 +669,5 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_string(s, "translate_cloud_region", "eastus"); // Whisper parameters - obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); - obs_data_set_default_int(s, "n_context_sentences", 0); - obs_data_set_default_string(s, "initial_prompt", ""); - obs_data_set_default_int(s, "n_threads", 4); - obs_data_set_default_int(s, "n_max_text_ctx", 16384); - obs_data_set_default_bool(s, "whisper_translate", false); - obs_data_set_default_bool(s, "no_context", true); - obs_data_set_default_bool(s, "single_segment", true); - obs_data_set_default_bool(s, "print_special", false); - obs_data_set_default_bool(s, "print_progress", false); - obs_data_set_default_bool(s, "print_realtime", false); - obs_data_set_default_bool(s, "print_timestamps", false); - obs_data_set_default_bool(s, "token_timestamps", false); - obs_data_set_default_bool(s, "dtw_token_timestamps", false); - obs_data_set_default_double(s, "thold_pt", 0.01); - obs_data_set_default_double(s, "thold_ptsum", 0.01); - obs_data_set_default_int(s, "max_len", 0); - obs_data_set_default_bool(s, "split_on_word", true); - obs_data_set_default_int(s, "max_tokens", 50); - obs_data_set_default_bool(s, "suppress_blank", false); - 
obs_data_set_default_bool(s, "suppress_non_speech_tokens", false); - obs_data_set_default_double(s, "temperature", 0.1); - obs_data_set_default_double(s, "max_initial_ts", 1.0); - obs_data_set_default_double(s, "length_penalty", -1.0); + apply_whisper_params_defaults_on_settings(s); } diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 5e13d52..ebd765d 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -25,6 +25,7 @@ #include "whisper-utils/whisper-language.h" #include "whisper-utils/whisper-model-utils.h" #include "whisper-utils/whisper-utils.h" +#include "whisper-utils/whisper-params.h" #include "translation/language_codes.h" #include "translation/translation-utils.h" #include "translation/translation.h" @@ -364,51 +365,27 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->sentence_psum_accept_thresh = (float)obs_data_get_double(s, "sentence_psum_accept_thresh"); - gf->whisper_params = whisper_full_default_params( - (whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method")); - gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec"); + apply_whisper_params_from_settings(gf->whisper_params, s); + if (!new_translate || gf->translation_model_index != "whisper-based-translation") { const char *whisper_language_select = obs_data_get_string(s, "whisper_language_select"); - gf->whisper_params.language = (whisper_language_select != nullptr && - strlen(whisper_language_select) > 0) - ? whisper_language_select - : "auto"; + const bool language_selected = whisper_language_select != nullptr && + strlen(whisper_language_select) > 0; + gf->whisper_params.language = (language_selected) ? 
whisper_language_select + : "auto"; + gf->whisper_params.detect_language = !language_selected; } else { // take the language from gf->target_lang if (language_codes_to_whisper.count(gf->target_lang) > 0) { gf->whisper_params.language = language_codes_to_whisper[gf->target_lang].c_str(); + gf->whisper_params.detect_language = false; } else { gf->whisper_params.language = "auto"; + gf->whisper_params.detect_language = true; } } - gf->whisper_params.initial_prompt = - obs_data_get_string(s, "initial_prompt") != nullptr - ? obs_data_get_string(s, "initial_prompt") - : ""; - gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads"); - gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx"); - gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate"); - gf->whisper_params.no_context = obs_data_get_bool(s, "no_context"); - gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment"); - gf->whisper_params.print_special = obs_data_get_bool(s, "print_special"); - gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress"); - gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime"); - gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps"); - gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps"); - gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt"); - gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum"); - gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len"); - gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word"); - gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens"); - gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank"); - gf->whisper_params.suppress_non_speech_tokens = - obs_data_get_bool(s, "suppress_non_speech_tokens"); - gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature"); - 
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts"); - gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty"); - gf->whisper_params.no_timestamps = true; if (gf->vad) { const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold"); diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 493c89c..a414931 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -10,6 +10,17 @@ #include #endif +/** + * @brief Extracts audio data from the buffer, resamples it, and updates timestamp offsets. + * + * This function extracts audio data from the input buffer, resamples it to 16kHz, and updates + * gf->resampled_buffer with the resampled data. + * + * @param gf Pointer to the transcription filter data structure. + * @param start_timestamp_offset_ns Reference to the start timestamp offset in nanoseconds. + * @param end_timestamp_offset_ns Reference to the end timestamp offset in nanoseconds. + * @return Returns 0 on success, 1 if the input buffer is empty. 
+ */ int get_data_from_buf_and_resample(transcription_filter_data *gf, uint64_t &start_timestamp_offset_ns, uint64_t &end_timestamp_offset_ns) @@ -111,6 +122,83 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf, return 0; } +vad_state vad_disabled_segmentation(transcription_filter_data *gf, vad_state last_vad_state) +{ + // get data from buffer and resample + uint64_t start_timestamp_offset_ns = 0; + uint64_t end_timestamp_offset_ns = 0; + + const int ret = get_data_from_buf_and_resample(gf, start_timestamp_offset_ns, + end_timestamp_offset_ns); + if (ret != 0) { + // if there's data on the whisper buffer - run inference as "final" segment + if (gf->whisper_buffer.size > 0) { + obs_log(gf->log_level, + "VAD disabled: no new input but whisper buffer has %lu bytes, run inference", + gf->whisper_buffer.size); + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, + last_vad_state.end_ts_offset_ms, + VAD_STATE_WAS_OFF); + } + return last_vad_state; + } + + // push the data into gf->whisper_buffer + circlebuf_push_back(&gf->whisper_buffer, gf->resampled_buffer.data, + gf->resampled_buffer.size); + // clear the resampled buffer + circlebuf_pop_front(&gf->resampled_buffer, nullptr, gf->resampled_buffer.size); + + const uint64_t whisper_buf_samples = gf->whisper_buffer.size / sizeof(float); + const bool is_partial_segment = + whisper_buf_samples < (uint64_t)(gf->segment_duration * WHISPER_SAMPLE_RATE / 1000); + +#ifdef LOCALVOCAL_EXTRA_VERBOSE + obs_log(gf->log_level, + "VAD disabled: total %d frames (%lu bytes) in whisper buffer, state was %s new state is %s", + whisper_buf_samples, gf->whisper_buffer.size, last_vad_state.vad_on ? "ON" : "OFF", + is_partial_segment ?
"PARTIAL" : "OFF"); +#endif + + const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000; + + if (is_partial_segment) { + // check if we need to send the partial segment to inference based on + // the last partial segment end timestamp + const uint64_t unprocessed_length_ms = + end_ts_offset_ms - last_vad_state.last_partial_segment_end_ts; + if (unprocessed_length_ms > (uint64_t)gf->partial_latency) { + if (gf->partial_transcription) { + obs_log(gf->log_level, + "VAD disabled: partial segment with %lu ms unprocessed audio. start %lu, end %lu", + unprocessed_length_ms, last_vad_state.start_ts_offest_ms, + end_ts_offset_ms); + // Send to inference + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, + end_ts_offset_ms, VAD_STATE_PARTIAL); + } else { + obs_log(gf->log_level, + "VAD disabled: partial segment with %lu ms unprocessed audio. start %lu, end %lu. Skipping.", + unprocessed_length_ms, last_vad_state.start_ts_offest_ms, + end_ts_offset_ms); + } + // update the last partial segment end timestamp + last_vad_state.last_partial_segment_end_ts = end_ts_offset_ms; + } + + return {false, last_vad_state.start_ts_offest_ms, end_ts_offset_ms, + last_vad_state.last_partial_segment_end_ts}; + } else { + obs_log(gf->log_level, + "VAD disabled: full segment end -> send to inference. 
start %lu, end %lu", + last_vad_state.start_ts_offest_ms, end_ts_offset_ms); + // send the entire buffer to inference + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, end_ts_offset_ms, + VAD_STATE_WAS_OFF); + return {false, end_ts_offset_ms, end_ts_offset_ms, end_ts_offset_ms}; + } +} + vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) { // get data from buffer and resample diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h index 996002b..16a5992 100644 --- a/src/whisper-utils/vad-processing.h +++ b/src/whisper-utils/vad-processing.h @@ -1,9 +1,48 @@ #ifndef VAD_PROCESSING_H #define VAD_PROCESSING_H +/** + * @file vad-processing.h + * @brief Header file for Voice Activity Detection (VAD) processing utilities. + * + * This file contains the declarations of enums, structs, and functions used for + * VAD processing in the transcription filter. + */ + +/** + * @enum VadState + * @brief Enumeration of possible VAD states. + * + * - VAD_STATE_WAS_ON: VAD was previously on. + * - VAD_STATE_WAS_OFF: VAD was previously off. + * - VAD_STATE_IS_OFF: VAD is currently off. + * - VAD_STATE_PARTIAL: VAD is in a partial state. + */ enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL }; + +/** + * @enum VadMode + * @brief Enumeration of possible VAD modes. + * + * - VAD_MODE_ACTIVE: VAD is actively processing. + * - VAD_MODE_HYBRID: VAD is in hybrid mode. + * - VAD_MODE_DISABLED: VAD is disabled. + */ enum VadMode { VAD_MODE_ACTIVE = 0, VAD_MODE_HYBRID, VAD_MODE_DISABLED }; +/** + * @struct vad_state + * @brief Structure representing the state of VAD. + * + * @var vad_state::vad_on + * Indicates whether VAD is currently on. + * @var vad_state::start_ts_offest_ms + * Timestamp offset in milliseconds when VAD started. + * @var vad_state::end_ts_offset_ms + * Timestamp offset in milliseconds when VAD ended. 
+ * @var vad_state::last_partial_segment_end_ts + * Timestamp of the end of the last partial segment. + */ struct vad_state { bool vad_on; uint64_t start_ts_offest_ms; @@ -11,6 +50,7 @@ struct vad_state { uint64_t last_partial_segment_end_ts; }; +vad_state vad_disabled_segmentation(transcription_filter_data *gf, vad_state last_vad_state); vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state); vad_state hybrid_vad_segmentation(transcription_filter_data *gf, vad_state last_vad_state); void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file); diff --git a/src/whisper-utils/whisper-params.cpp b/src/whisper-utils/whisper-params.cpp new file mode 100644 index 0000000..883e2af --- /dev/null +++ b/src/whisper-utils/whisper-params.cpp @@ -0,0 +1,192 @@ +#include "whisper-params.h" + +#include + +#define MT_ obs_module_text + +void whisper_params_pretty_print(whisper_full_params &params) +{ + obs_log(LOG_INFO, "Whisper params:"); + obs_log(LOG_INFO, "strategy: %s", + params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH + ? "beam_search" + : "greedy"); + obs_log(LOG_INFO, "n_threads: %d", params.n_threads); + obs_log(LOG_INFO, "n_max_text_ctx: %d", params.n_max_text_ctx); + obs_log(LOG_INFO, "offset_ms: %d", params.offset_ms); + obs_log(LOG_INFO, "duration_ms: %d", params.duration_ms); + obs_log(LOG_INFO, "translate: %s", params.translate ? "true" : "false"); + obs_log(LOG_INFO, "no_context: %s", params.no_context ? "true" : "false"); + obs_log(LOG_INFO, "no_timestamps: %s", params.no_timestamps ? "true" : "false"); + obs_log(LOG_INFO, "single_segment: %s", params.single_segment ? "true" : "false"); + obs_log(LOG_INFO, "print_special: %s", params.print_special ? "true" : "false"); + obs_log(LOG_INFO, "print_progress: %s", params.print_progress ? "true" : "false"); + obs_log(LOG_INFO, "print_realtime: %s", params.print_realtime ?
"true" : "false"); + obs_log(LOG_INFO, "print_timestamps: %s", params.print_timestamps ? "true" : "false"); + obs_log(LOG_INFO, "token_timestamps: %s", params.token_timestamps ? "true" : "false"); + obs_log(LOG_INFO, "thold_pt: %f", params.thold_pt); + obs_log(LOG_INFO, "thold_ptsum: %f", params.thold_ptsum); + obs_log(LOG_INFO, "max_len: %d", params.max_len); + obs_log(LOG_INFO, "split_on_word: %s", params.split_on_word ? "true" : "false"); + obs_log(LOG_INFO, "max_tokens: %d", params.max_tokens); + obs_log(LOG_INFO, "debug_mode: %s", params.debug_mode ? "true" : "false"); + obs_log(LOG_INFO, "audio_ctx: %d", params.audio_ctx); + obs_log(LOG_INFO, "tdrz_enable: %s", params.tdrz_enable ? "true" : "false"); + obs_log(LOG_INFO, "suppress_regex: %s", params.suppress_regex); + obs_log(LOG_INFO, "initial_prompt: %s", params.initial_prompt); + obs_log(LOG_INFO, "language: %s", params.language); + obs_log(LOG_INFO, "detect_language: %s", params.detect_language ? "true" : "false"); + obs_log(LOG_INFO, "suppress_blank: %s", params.suppress_blank ? "true" : "false"); + obs_log(LOG_INFO, "suppress_non_speech_tokens: %s", + params.suppress_non_speech_tokens ? 
"true" : "false"); + obs_log(LOG_INFO, "temperature: %f", params.temperature); + obs_log(LOG_INFO, "max_initial_ts: %f", params.max_initial_ts); + obs_log(LOG_INFO, "length_penalty: %f", params.length_penalty); + obs_log(LOG_INFO, "temperature_inc: %f", params.temperature_inc); + obs_log(LOG_INFO, "entropy_thold: %f", params.entropy_thold); + obs_log(LOG_INFO, "logprob_thold: %f", params.logprob_thold); + obs_log(LOG_INFO, "no_speech_thold: %f", params.no_speech_thold); + obs_log(LOG_INFO, "greedy.best_of: %d", params.greedy.best_of); + obs_log(LOG_INFO, "beam_search.beam_size: %d", params.beam_search.beam_size); + obs_log(LOG_INFO, "beam_search.patience: %f", params.beam_search.patience); +} + +void apply_whisper_params_defaults_on_settings(obs_data_t *s) +{ + whisper_full_params whisper_params_tmp = whisper_full_default_params( + whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH); + + obs_data_set_default_int(s, "strategy", + whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH); + obs_data_set_default_int(s, "n_threads", whisper_params_tmp.n_threads); + obs_data_set_default_int(s, "n_max_text_ctx", whisper_params_tmp.n_max_text_ctx); + obs_data_set_default_int(s, "offset_ms", whisper_params_tmp.offset_ms); + obs_data_set_default_int(s, "duration_ms", whisper_params_tmp.duration_ms); + obs_data_set_default_bool(s, "whisper_translate", whisper_params_tmp.translate); + obs_data_set_default_bool(s, "no_context", whisper_params_tmp.no_context); + obs_data_set_default_bool(s, "no_timestamps", whisper_params_tmp.no_timestamps); + obs_data_set_default_bool(s, "single_segment", whisper_params_tmp.single_segment); + obs_data_set_default_bool(s, "print_special", false); + obs_data_set_default_bool(s, "print_progress", false); + obs_data_set_default_bool(s, "print_realtime", false); + obs_data_set_default_bool(s, "print_timestamps", false); + obs_data_set_default_bool(s, "token_timestamps", whisper_params_tmp.token_timestamps); + obs_data_set_default_double(s, 
"thold_pt", whisper_params_tmp.thold_pt); + obs_data_set_default_double(s, "thold_ptsum", whisper_params_tmp.thold_ptsum); + obs_data_set_default_int(s, "max_len", whisper_params_tmp.max_len); + obs_data_set_default_bool(s, "split_on_word", whisper_params_tmp.split_on_word); + obs_data_set_default_int(s, "max_tokens", whisper_params_tmp.max_tokens); + obs_data_set_default_bool(s, "debug_mode", whisper_params_tmp.debug_mode); + obs_data_set_default_int(s, "audio_ctx", whisper_params_tmp.audio_ctx); + obs_data_set_default_bool(s, "tdrz_enable", whisper_params_tmp.tdrz_enable); + obs_data_set_default_string(s, "suppress_regex", whisper_params_tmp.suppress_regex); + obs_data_set_default_string(s, "initial_prompt", whisper_params_tmp.initial_prompt); + // obs_data_set_default_string(s, "language", whisper_params_tmp.language); + obs_data_set_default_bool(s, "detect_language", whisper_params_tmp.detect_language); + obs_data_set_default_bool(s, "suppress_blank", false); + obs_data_set_default_bool(s, "suppress_non_speech_tokens", false); + obs_data_set_default_double(s, "temperature", whisper_params_tmp.temperature); + obs_data_set_default_double(s, "max_initial_ts", whisper_params_tmp.max_initial_ts); + obs_data_set_default_double(s, "length_penalty", whisper_params_tmp.length_penalty); + obs_data_set_default_double(s, "temperature_inc", whisper_params_tmp.temperature_inc); + obs_data_set_default_double(s, "entropy_thold", whisper_params_tmp.entropy_thold); + obs_data_set_default_double(s, "logprob_thold", whisper_params_tmp.logprob_thold); + obs_data_set_default_double(s, "no_speech_thold", whisper_params_tmp.no_speech_thold); + obs_data_set_default_int(s, "greedy.best_of", whisper_params_tmp.greedy.best_of); + obs_data_set_default_int(s, "beam_search.beam_size", + whisper_params_tmp.beam_search.beam_size); + obs_data_set_default_double(s, "beam_search.patience", + whisper_params_tmp.beam_search.patience); +} + +void 
apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *settings) +{ + params = whisper_full_default_params( + (whisper_sampling_strategy)obs_data_get_int(settings, "strategy")); + params.n_threads = (int)obs_data_get_int(settings, "n_threads"); + params.n_max_text_ctx = (int)obs_data_get_int(settings, "n_max_text_ctx"); + params.offset_ms = (int)obs_data_get_int(settings, "offset_ms"); + params.duration_ms = (int)obs_data_get_int(settings, "duration_ms"); + params.translate = obs_data_get_bool(settings, "whisper_translate"); + params.no_context = obs_data_get_bool(settings, "no_context"); + params.no_timestamps = obs_data_get_bool(settings, "no_timestamps"); + params.single_segment = obs_data_get_bool(settings, "single_segment"); + params.print_special = obs_data_get_bool(settings, "print_special"); + params.print_progress = obs_data_get_bool(settings, "print_progress"); + params.print_realtime = obs_data_get_bool(settings, "print_realtime"); + params.print_timestamps = obs_data_get_bool(settings, "print_timestamps"); + params.token_timestamps = obs_data_get_bool(settings, "token_timestamps"); + params.thold_pt = (float)obs_data_get_double(settings, "thold_pt"); + params.thold_ptsum = (float)obs_data_get_double(settings, "thold_ptsum"); + params.max_len = (int)obs_data_get_int(settings, "max_len"); + params.split_on_word = obs_data_get_bool(settings, "split_on_word"); + params.max_tokens = (int)obs_data_get_int(settings, "max_tokens"); + params.debug_mode = obs_data_get_bool(settings, "debug_mode"); + params.audio_ctx = (int)obs_data_get_int(settings, "audio_ctx"); + params.tdrz_enable = obs_data_get_bool(settings, "tdrz_enable"); + params.suppress_regex = obs_data_get_string(settings, "suppress_regex"); + params.initial_prompt = obs_data_get_string(settings, "initial_prompt"); + // params.language = obs_data_get_string(settings, "language"); + params.detect_language = obs_data_get_bool(settings, "detect_language"); + params.suppress_blank =
obs_data_get_bool(settings, "suppress_blank"); + params.suppress_non_speech_tokens = + obs_data_get_bool(settings, "suppress_non_speech_tokens"); + params.temperature = (float)obs_data_get_double(settings, "temperature"); + params.max_initial_ts = (float)obs_data_get_double(settings, "max_initial_ts"); + params.length_penalty = (float)obs_data_get_double(settings, "length_penalty"); + params.temperature_inc = (float)obs_data_get_double(settings, "temperature_inc"); + params.entropy_thold = (float)obs_data_get_double(settings, "entropy_thold"); + params.logprob_thold = (float)obs_data_get_double(settings, "logprob_thold"); + params.no_speech_thold = (float)obs_data_get_double(settings, "no_speech_thold"); + params.greedy.best_of = (int)obs_data_get_int(settings, "greedy.best_of"); + params.beam_search.beam_size = (int)obs_data_get_int(settings, "beam_search.beam_size"); + params.beam_search.patience = (float)obs_data_get_double(settings, "beam_search.patience"); +} + +void add_whisper_params_group_properties(obs_properties_t *ppts) +{ + obs_properties_t *g = obs_properties_create(); + obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), + OBS_GROUP_NORMAL, g); + + obs_properties_add_list(g, "strategy", MT_("whisper_sampling_strategy"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); + obs_properties_add_int(g, "n_threads", MT_("n_threads"), 1, 8, 1); + obs_properties_add_int(g, "n_max_text_ctx", MT_("n_max_text_ctx"), 1, 100, 1); + obs_properties_add_int(g, "offset_ms", MT_("offset_ms"), 0, 10000, 100); + obs_properties_add_int(g, "duration_ms", MT_("duration_ms"), 0, 30000, 500); + obs_properties_add_bool(g, "whisper_translate", MT_("whisper_translate")); + obs_properties_add_bool(g, "no_context", MT_("no_context")); + obs_properties_add_bool(g, "no_timestamps", MT_("no_timestamps")); + obs_properties_add_bool(g, "single_segment", MT_("single_segment")); + obs_properties_add_bool(g, "print_special", MT_("print_special")); + 
obs_properties_add_bool(g, "print_progress", MT_("print_progress")); + obs_properties_add_bool(g, "print_realtime", MT_("print_realtime")); + obs_properties_add_bool(g, "print_timestamps", MT_("print_timestamps")); + obs_properties_add_bool(g, "token_timestamps", MT_("token_timestamps")); + obs_properties_add_float(g, "thold_pt", MT_("thold_pt"), 0, 1, 0.05); + obs_properties_add_float(g, "thold_ptsum", MT_("thold_ptsum"), 0, 1, 0.05); + obs_properties_add_int(g, "max_len", MT_("max_len"), 0, 1000, 1); + obs_properties_add_bool(g, "split_on_word", MT_("split_on_word")); + obs_properties_add_int(g, "max_tokens", MT_("max_tokens"), 0, 1000, 1); + obs_properties_add_bool(g, "debug_mode", MT_("debug_mode")); + obs_properties_add_int(g, "audio_ctx", MT_("audio_ctx"), 0, 10, 1); + obs_properties_add_bool(g, "tdrz_enable", MT_("tdrz_enable")); + obs_properties_add_text(g, "suppress_regex", MT_("suppress_regex"), OBS_TEXT_DEFAULT); + obs_properties_add_text(g, "initial_prompt", MT_("initial_prompt"), OBS_TEXT_DEFAULT); + // obs_properties_add_text(g, "language", MT_("language"), OBS_TEXT_DEFAULT); + obs_properties_add_bool(g, "detect_language", MT_("detect_language")); + obs_properties_add_bool(g, "suppress_blank", MT_("suppress_blank")); + obs_properties_add_bool(g, "suppress_non_speech_tokens", MT_("suppress_non_speech_tokens")); + obs_properties_add_float(g, "temperature", MT_("temperature"), 0, 1, 0.05); + obs_properties_add_float(g, "max_initial_ts", MT_("max_initial_ts"), 0, 100, 1); + obs_properties_add_float(g, "length_penalty", MT_("length_penalty"), 0, 1, 0.05); + obs_properties_add_float(g, "temperature_inc", MT_("temperature_inc"), 0, 1, 0.05); + obs_properties_add_float(g, "entropy_thold", MT_("entropy_thold"), 0, 1, 0.05); + obs_properties_add_float(g, "logprob_thold", MT_("logprob_thold"), 0, 1, 0.05); + obs_properties_add_float(g, "no_speech_thold", MT_("no_speech_thold"), 0, 1, 0.05); + obs_properties_add_int(g, "greedy.best_of", MT_("greedy.best_of"), 1, 
10, 1); + obs_properties_add_int(g, "beam_search.beam_size", MT_("beam_search.beam_size"), 1, 10, 1); + obs_properties_add_float(g, "beam_search.patience", MT_("beam_search.patience"), 0, 1, + 0.05); +} diff --git a/src/whisper-utils/whisper-params.h b/src/whisper-utils/whisper-params.h new file mode 100644 index 0000000..2106b2f --- /dev/null +++ b/src/whisper-utils/whisper-params.h @@ -0,0 +1,52 @@ +#ifndef WHISPER_PARAMS_H +#define WHISPER_PARAMS_H + +#include "transcription-filter-data.h" + +/** + * @brief Prints the whisper parameters in a human-readable format. + * + * This function outputs the whisper parameters to the console in a formatted + * and readable manner. + * + * @param params Reference to the whisper_full_params structure. + */ +void whisper_params_pretty_print(whisper_full_params &params); + +/** + * @brief Applies default whisper parameters to the given settings. + * + * This function sets the default values for whisper parameters on the provided + * OBS data settings object. It ensures that all necessary parameters have + * their default values, which can be used as a baseline for further + * customization. + * + * @param s A pointer to an obs_data_t structure representing the settings + * where the default whisper parameters will be applied. + */ +void apply_whisper_params_defaults_on_settings(obs_data_t *s); + +/** + * @brief Applies whisper parameters from the given settings. + * + * This function takes a reference to a `whisper_full_params` structure and an + * `obs_data_t` settings object, and applies the settings to the whisper parameters. + * + * @param params A reference to the `whisper_full_params` structure that will be modified. + * @param settings A pointer to the `obs_data_t` settings object containing the parameters to apply. + */ +void apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *settings); + +/** + * @brief Adds whisper parameters group properties to the given OBS properties object. 
+ * + * This function adds a group of properties related to whisper parameters to the + * specified OBS properties object. These properties can be used to configure + * whisper-related settings in the OBS application. + * + * @param ppts A pointer to an OBS properties object where the whisper parameters + * group properties will be added. + */ +void add_whisper_params_group_properties(obs_properties_t *ppts); + +#endif // WHISPER_PARAMS_H diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 3518edf..b53c5d4 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -198,10 +198,23 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter obs_log(gf->log_level, "Initial prompt: %s", gf->whisper_params.initial_prompt); } + obs_log(gf->log_level, "Running whisper inference. single segment? %s", + gf->whisper_params.single_segment ? "yes" : "no"); + // run the inference int whisper_full_result = -1; gf->whisper_params.duration_ms = (int)(whisper_duration_ms); try { + // whisper_full_params whisper_params_tmp = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH); + // whisper_params_tmp.language = gf->whisper_params.language; + // gf->whisper_params.no_timestamps = false; + // whisper_params_tmp.print_progress = false; + // whisper_params_tmp.print_timestamps = false; + // whisper_params_tmp.split_on_word = true; + // whisper_params_tmp.max_tokens = 100; + // whisper_params_tmp.suppress_blank = false; + // whisper_params_pretty_print(gf->whisper_params); + // whisper_params_pretty_print(whisper_params_tmp); whisper_full_result = whisper_full(gf->whisper_context, gf->whisper_params, pcm32f_data, (int)pcm32f_size); } catch (const std::exception &e) { @@ -355,7 +368,7 @@ void whisper_loop(void *data) obs_log(gf->log_level, "Starting whisper thread"); - vad_state current_vad_state = {false, now_ms(), 0, 0}; + vad_state 
current_vad_state = {false, 0, 0, 0}; const char *whisper_loop_name = "Whisper loop"; profile_register_root(whisper_loop_name, 50 * 1000 * 1000); @@ -377,6 +390,8 @@ void whisper_loop(void *data) current_vad_state = hybrid_vad_segmentation(gf, current_vad_state); } else if (gf->vad_mode == VAD_MODE_ACTIVE) { current_vad_state = vad_based_segmentation(gf, current_vad_state); + } else if (gf->vad_mode == VAD_MODE_DISABLED) { + current_vad_state = vad_disabled_segmentation(gf, current_vad_state); } if (!gf->cleared_last_sub) { @@ -399,7 +414,7 @@ void whisper_loop(void *data) // or if the whisper context is null std::unique_lock lock(gf->whisper_ctx_mutex); if (gf->input_buffers->size == 0) { - gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50)); + gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(250)); } } diff --git a/src/whisper-utils/whisper-utils.h b/src/whisper-utils/whisper-utils.h index c62168b..b55a1ae 100644 --- a/src/whisper-utils/whisper-utils.h +++ b/src/whisper-utils/whisper-utils.h @@ -1,25 +1,81 @@ +/** + * @file whisper-utils.h + * @brief Utility functions for handling whisper transcription operations. + * + * This header file contains declarations for various utility functions used + * in the whisper transcription process, including thread management, sequence + * operations, and timestamp formatting. + * + * @note The timestamp conversion function is adapted from the whisper.cpp project. + * + * @see transcription-filter-data.h + */ #ifndef WHISPER_UTILS_H #define WHISPER_UTILS_H #include "transcription-filter-data.h" #include <string> +#include <vector> +/** + * @brief Shuts down the whisper thread. + * + * This function terminates the whisper thread associated with the given + * transcription filter data. + * + * @param gf Pointer to the transcription filter data structure. + */ void shutdown_whisper_thread(struct transcription_filter_data *gf); + +/** + * @brief Starts the whisper thread with a specified path. 
+ * + * This function initializes and starts the whisper thread using the provided + * transcription filter data, path, and Silero VAD model file. + * + * @param gf Pointer to the transcription filter data structure. + * @param path Reference to a string containing the path. + * @param silero_vad_model_file Pointer to a character array containing the Silero VAD model file. + */ void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path, const char *silero_vad_model_file); +/** + * @brief Finds the start of overlap between two sequences. + * + * This function compares two sequences of whisper token data and determines + * the starting indices of their overlap. + * + * @param seq1 Reference to the first sequence of whisper token data. + * @param seq2 Reference to the second sequence of whisper token data. + * @return std::pair<int, int> A pair of integers representing the starting indices of the overlap in seq1 and seq2. + */ std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &seq1, const std::vector<whisper_token_data> &seq2); + +/** + * @brief Reconstructs a sentence from two sequences. + * + * This function merges two sequences of whisper token data to reconstruct a + * complete sentence. + * + * @param seq1 Reference to the first sequence of whisper token data. + * @param seq2 Reference to the second sequence of whisper token data. + * @return std::vector<whisper_token_data> A vector containing the reconstructed sentence. + */ std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1, const std::vector<whisper_token_data> &seq2); /** - * @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss" . - * Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp - * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream) - * @return std::string Timestamp in the format "MM:SS.sss" + * @brief Converts a timestamp in milliseconds to a string in the format "MM:SS.sss". 
+ * + * This function takes a timestamp in milliseconds and converts it to a string + * representation in the format "MM:SS.sss". + * + * @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream). + * @return std::string Timestamp in the format "MM:SS.sss". */ std::string to_timestamp(uint64_t t_ms_offset); -#endif /* WHISPER_UTILS_H */ +#endif // WHISPER_UTILS_H