Skip to content

Commit

Permalink
No VAD segmentation option (#182)
Browse files Browse the repository at this point in the history
* Add support for disabled VAD mode and enhance CMake configuration

* Enhance VAD processing and transcription filter data structure with additional comments and logic adjustments

* Enhance VAD processing with improved logging, adjust single segment default, and update inference handling

* Refactor whisper parameter handling and enhance utility functions for better clarity and maintainability

* Add whisper parameters group properties and clean up related code

* Add whisper parameters handling and update related files for improved functionality

* Refactor whisper parameter type casting for improved clarity and consistency

* trigger build

* Fix logging message to use the correct variable for saved sentence
  • Loading branch information
royshil authored Nov 25, 2024
1 parent 04a6f6a commit a32e327
Show file tree
Hide file tree
Showing 12 changed files with 515 additions and 163 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ if(WIN32)
"cpu"
CACHE STRING "Acceleration to use")
endif()
set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda")
set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda" "vulkan")
endif()

include(cmake/BuildWhispercpp.cmake)
Expand Down Expand Up @@ -101,6 +101,11 @@ include(cmake/BuildICU.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})

# Opt-in extra-verbose logging: when the LOCALVOCAL_EXTRA_VERBOSE environment
# variable is set while CMake configures the project, compile the project
# target with the LOCALVOCAL_EXTRA_VERBOSE preprocessor definition (PRIVATE, so
# it is not propagated to consumers of the target). The environment is read at
# configure time only, so re-run CMake after setting or unsetting the variable.
if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE})
target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE)
endif()

target_sources(
${CMAKE_PROJECT_NAME}
PRIVATE src/plugin-main.c
Expand All @@ -117,6 +122,7 @@ target_sources(
src/whisper-utils/whisper-processing.cpp
src/whisper-utils/whisper-utils.cpp
src/whisper-utils/whisper-model-utils.cpp
src/whisper-utils/whisper-params.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/whisper-utils/vad-processing.cpp
Expand Down
61 changes: 40 additions & 21 deletions data/locale/en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,9 @@ external_model_file="External model file"
whisper_parameters="Whisper Model Parameters"
language="Input Language"
whisper_sampling_method="Whisper Sampling Method"
n_threads="Number of threads"
n_max_text_ctx="Max text context"
translate="Translate"
translate_local="Local Translation"
translate_cloud="Cloud Translation"
no_context="No context"
single_segment="Single segment"
print_special="Print special"
print_progress="Print progress"
print_realtime="Print realtime"
print_timestamps="Print timestamps"
token_timestamps="Token timestamps"
thold_pt="Token prob. threshold"
thold_ptsum="Token sum prob. threshold"
max_len="Max length in chars"
split_on_word="Split on word"
max_tokens="Max tokens"
speed_up="Speed up"
initial_prompt="Initial prompt"
suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
save_srt="Save in SRT format"
truncate_output_file="Truncate file on new sentence"
only_while_recording="Write output only while recording"
Expand Down Expand Up @@ -92,11 +71,51 @@ partial_latency="Latency (ms)"
vad_mode="VAD Mode"
Active_VAD="Active VAD"
Hybrid_VAD="Hybrid VAD"
No_VAD="No VAD"
translate_only_full_sentences="Translate only full sentences"
duration_filter_threshold="Duration filter"
segment_duration="Segment duration"
n_context_sentences="# Context sentences"
max_sub_duration="Max. sub duration (ms)"
# Whisper model parameters
strategy="Strategy"
n_threads="Number of threads"
n_max_text_ctx="Max text context"
offset_ms="Offset (ms)"
duration_ms="Duration (ms)"
whisper_translate="Translate"
no_context="No context"
no_timestamps="No timestamps"
single_segment="Single segment"
print_special="Print special"
print_progress="Print progress"
print_realtime="Print realtime"
print_timestamps="Print timestamps"
token_timestamps="Token timestamps"
thold_pt="Token prob. threshold"
thold_ptsum="Token sum prob. threshold"
max_len="Max length in chars"
split_on_word="Split on word"
max_tokens="Max tokens"
debug_mode="Debug mode"
audio_ctx="Audio context"
tdrz_enable="Enable TDRZ"
suppress_regex="Suppress regex"
initial_prompt="Initial prompt"
language="Input Language"
detect_language="Detect language"
suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
temperature_inc="Temperature increment"
entropy_thold="Entropy threshold"
logprob_thold="Logprob threshold"
no_speech_thold="No speech threshold"
greedy.best_of="Greedy best of"
beam_search.beam_size="Beam size"
beam_search.patience="Patience"
Google-Cloud-Translation="Google Cloud Translation"
Microsoft-Translator="Microsoft Azure Translator"
Amazon-Translate="AWS Translate"
Expand Down
4 changes: 3 additions & 1 deletion src/transcription-filter-callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
openmode |= std::ios::app;
}
if (!gf->save_srt) {
// Write raw sentence to file
obs_log(gf->log_level, "Saving sentence '%s' to file %s", sentence.c_str(),
gf->output_file_path.c_str());
// Write raw sentence to text file (non-srt format)
try {
std::ofstream output_file(file_path, openmode);
output_file << sentence << std::endl;
Expand Down
1 change: 1 addition & 0 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ struct transcription_filter_data {
bool partial_transcription = false;
int partial_latency = 1000;
float duration_filter_threshold = 2.25f;
// Duration of the target segment buffer in ms
int segment_duration = 7000;

// Cloud translation options
Expand Down
104 changes: 4 additions & 100 deletions src/transcription-filter-properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
#include "transcription-filter-utils.h"
#include "whisper-utils/whisper-language.h"
#include "whisper-utils/vad-processing.h"
#include "whisper-utils/whisper-params.h"
#include "model-utils/model-downloader-types.h"
#include "translation/language_codes.h"
#include "ui/filter-replace-dialog.h"
#include "ui/filter-replace-utils.h"

#include <string>
#include <vector>
#include "whisper-utils/whisper-utils.h"

bool translation_options_callback(obs_properties_t *props, obs_property_t *property,
obs_data_t *settings)
Expand Down Expand Up @@ -479,6 +481,7 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_
obs_property_t *vad_mode_list =
obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
obs_property_list_add_int(vad_mode_list, MT_("No_VAD"), VAD_MODE_DISABLED);
obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE);
obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID);
// add vad threshold slider
Expand Down Expand Up @@ -528,82 +531,6 @@ void add_logging_group_properties(obs_properties_t *ppts)
obs_property_list_add_int(list, "WARNING", LOG_WARNING);
}

/**
 * Register the "Whisper Model Parameters" property group on a properties object.
 *
 * Creates a new properties container, attaches it to `ppts` as a normal group
 * under the key "whisper_params_group", and fills it with one control per
 * whisper parameter that is exposed here. Controls are added in the order
 * listed below.
 *
 * @param ppts  Properties object that receives the new group.
 */
void add_whisper_params_group_properties(obs_properties_t *ppts)
{
	obs_properties_t *group = obs_properties_create();
	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
				 OBS_GROUP_NORMAL, group);

	// Sampling strategy selector (integer-valued combo box)
	obs_property_t *sampling_methods = obs_properties_add_list(
		group, "whisper_sampling_method", MT_("whisper_sampling_method"),
		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
	obs_property_list_add_int(sampling_methods, "Beam search", WHISPER_SAMPLING_BEAM_SEARCH);
	obs_property_list_add_int(sampling_methods, "Greedy", WHISPER_SAMPLING_GREEDY);

	// Number of context sentences: slider 0..5, step 1
	obs_properties_add_int_slider(group, "n_context_sentences", MT_("n_context_sentences"), 0,
				      5, 1);

	// n_threads: slider 1..8, step 1
	obs_properties_add_int_slider(group, "n_threads", MT_("n_threads"), 1, 8, 1);

	// n_max_text_ctx: max tokens to use from past text as prompt for the decoder
	obs_properties_add_int_slider(group, "n_max_text_ctx", MT_("n_max_text_ctx"), 0, 16384,
				      100);

	// Note: offset_ms (start offset) and duration_ms (audio duration to process)
	// get no control in this group.

	// translate
	obs_properties_add_bool(group, "whisper_translate", MT_("whisper_translate"));

	// no_context: do not use past transcription (if any) as initial prompt for the decoder
	obs_properties_add_bool(group, "no_context", MT_("no_context"));

	// single_segment: force single segment output (useful for streaming)
	obs_properties_add_bool(group, "single_segment", MT_("single_segment"));

	// print_special: print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
	obs_properties_add_bool(group, "print_special", MT_("print_special"));

	// print_progress: print progress information
	obs_properties_add_bool(group, "print_progress", MT_("print_progress"));

	// print_realtime: print results from within whisper.cpp (avoid it, use callback instead)
	obs_properties_add_bool(group, "print_realtime", MT_("print_realtime"));

	// print_timestamps: print timestamps for each text segment when printing realtime
	obs_properties_add_bool(group, "print_timestamps", MT_("print_timestamps"));

	// token_timestamps: enable token-level timestamps
	obs_properties_add_bool(group, "token_timestamps", MT_("token_timestamps"));

	// dtw_token_timestamps: enable DTW timestamps
	obs_properties_add_bool(group, "dtw_token_timestamps", MT_("dtw_token_timestamps"));

	// thold_pt: timestamp token probability threshold (~0.01)
	obs_properties_add_float_slider(group, "thold_pt", MT_("thold_pt"), 0.0f, 1.0f, 0.05f);

	// thold_ptsum: timestamp token sum probability threshold (~0.01)
	obs_properties_add_float_slider(group, "thold_ptsum", MT_("thold_ptsum"), 0.0f, 1.0f,
					0.05f);

	// max_len: max segment length in characters
	obs_properties_add_int_slider(group, "max_len", MT_("max_len"), 0, 100, 1);

	// split_on_word: split on word rather than on token (when used with max_len)
	obs_properties_add_bool(group, "split_on_word", MT_("split_on_word"));

	// max_tokens: max tokens per segment (0 = no limit)
	obs_properties_add_int_slider(group, "max_tokens", MT_("max_tokens"), 0, 100, 1);

	// initial_prompt: free-text field
	obs_properties_add_text(group, "initial_prompt", MT_("initial_prompt"), OBS_TEXT_DEFAULT);

	// suppress_blank
	obs_properties_add_bool(group, "suppress_blank", MT_("suppress_blank"));

	// suppress_non_speech_tokens
	obs_properties_add_bool(group, "suppress_non_speech_tokens",
				MT_("suppress_non_speech_tokens"));

	// temperature: slider 0..1, step 0.05
	obs_properties_add_float_slider(group, "temperature", MT_("temperature"), 0.0f, 1.0f,
					0.05f);

	// max_initial_ts: slider 0..1, step 0.05
	obs_properties_add_float_slider(group, "max_initial_ts", MT_("max_initial_ts"), 0.0f, 1.0f,
					0.05f);

	// length_penalty: slider -1..1, step 0.1
	obs_properties_add_float_slider(group, "length_penalty", MT_("length_penalty"), -1.0f,
					1.0f, 0.1f);
}

void add_general_group_properties(obs_properties_t *ppts)
{
// add "General" group
Expand Down Expand Up @@ -742,28 +669,5 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_string(s, "translate_cloud_region", "eastus");

// Whisper parameters
obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
obs_data_set_default_int(s, "n_context_sentences", 0);
obs_data_set_default_string(s, "initial_prompt", "");
obs_data_set_default_int(s, "n_threads", 4);
obs_data_set_default_int(s, "n_max_text_ctx", 16384);
obs_data_set_default_bool(s, "whisper_translate", false);
obs_data_set_default_bool(s, "no_context", true);
obs_data_set_default_bool(s, "single_segment", true);
obs_data_set_default_bool(s, "print_special", false);
obs_data_set_default_bool(s, "print_progress", false);
obs_data_set_default_bool(s, "print_realtime", false);
obs_data_set_default_bool(s, "print_timestamps", false);
obs_data_set_default_bool(s, "token_timestamps", false);
obs_data_set_default_bool(s, "dtw_token_timestamps", false);
obs_data_set_default_double(s, "thold_pt", 0.01);
obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 50);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
obs_data_set_default_double(s, "temperature", 0.1);
obs_data_set_default_double(s, "max_initial_ts", 1.0);
obs_data_set_default_double(s, "length_penalty", -1.0);
apply_whisper_params_defaults_on_settings(s);
}
43 changes: 10 additions & 33 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "whisper-utils/whisper-language.h"
#include "whisper-utils/whisper-model-utils.h"
#include "whisper-utils/whisper-utils.h"
#include "whisper-utils/whisper-params.h"
#include "translation/language_codes.h"
#include "translation/translation-utils.h"
#include "translation/translation.h"
Expand Down Expand Up @@ -364,51 +365,27 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->sentence_psum_accept_thresh =
(float)obs_data_get_double(s, "sentence_psum_accept_thresh");

gf->whisper_params = whisper_full_default_params(
(whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method"));
gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec");
apply_whisper_params_from_settings(gf->whisper_params, s);

if (!new_translate || gf->translation_model_index != "whisper-based-translation") {
const char *whisper_language_select =
obs_data_get_string(s, "whisper_language_select");
gf->whisper_params.language = (whisper_language_select != nullptr &&
strlen(whisper_language_select) > 0)
? whisper_language_select
: "auto";
const bool language_selected = whisper_language_select != nullptr &&
strlen(whisper_language_select) > 0;
gf->whisper_params.language = (language_selected) ? whisper_language_select
: "auto";
gf->whisper_params.detect_language = !language_selected;
} else {
// take the language from gf->target_lang
if (language_codes_to_whisper.count(gf->target_lang) > 0) {
gf->whisper_params.language =
language_codes_to_whisper[gf->target_lang].c_str();
gf->whisper_params.detect_language = false;
} else {
gf->whisper_params.language = "auto";
gf->whisper_params.detect_language = true;
}
}
gf->whisper_params.initial_prompt =
obs_data_get_string(s, "initial_prompt") != nullptr
? obs_data_get_string(s, "initial_prompt")
: "";
gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate");
gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment");
gf->whisper_params.print_special = obs_data_get_bool(s, "print_special");
gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress");
gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime");
gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps");
gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word");
gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
gf->whisper_params.suppress_non_speech_tokens =
obs_data_get_bool(s, "suppress_non_speech_tokens");
gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
gf->whisper_params.no_timestamps = true;

if (gf->vad) {
const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
Expand Down
Loading

0 comments on commit a32e327

Please sign in to comment.