No VAD segmentation option #182

Merged 10 commits on Nov 25, 2024
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -59,7 +59,7 @@ if(WIN32)
"cpu"
CACHE STRING "Acceleration to use")
endif()
set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda")
set_property(CACHE ACCELERATION PROPERTY STRINGS "cpu" "hipblas" "cuda" "vulkan")
endif()

include(cmake/BuildWhispercpp.cmake)
@@ -101,6 +101,11 @@ include(cmake/BuildICU.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})

# check env var for extra verbose logging
if(DEFINED ENV{LOCALVOCAL_EXTRA_VERBOSE})
target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE LOCALVOCAL_EXTRA_VERBOSE)
endif()

target_sources(
${CMAKE_PROJECT_NAME}
PRIVATE src/plugin-main.c
@@ -117,6 +122,7 @@ target_sources(
src/whisper-utils/whisper-processing.cpp
src/whisper-utils/whisper-utils.cpp
src/whisper-utils/whisper-model-utils.cpp
src/whisper-utils/whisper-params.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/whisper-utils/vad-processing.cpp
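The new configure-time check only injects the LOCALVOCAL_EXTRA_VERBOSE compile definition when an environment variable of the same name is set, so extra-verbose logging costs nothing in a normal build. A minimal sketch of how such a guard is typically consumed in the plugin sources follows; the macro name comes from this diff, but the helper function and its call site are illustrative assumptions, not code from the PR:

```cpp
// Illustrative sketch: gate noisy per-chunk logging behind the
// LOCALVOCAL_EXTRA_VERBOSE definition added above (assumed usage).
#include <obs-module.h>

static void log_chunk_stats_verbose(size_t num_frames, float rms)
{
#ifdef LOCALVOCAL_EXTRA_VERBOSE
	// Compiled in only when CMake was configured with the
	// LOCALVOCAL_EXTRA_VERBOSE environment variable set.
	blog(LOG_DEBUG, "[LocalVocal] audio chunk: %zu frames, rms=%.4f",
	     num_frames, rms);
#else
	(void)num_frames;
	(void)rms;
#endif
}
```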
61 changes: 40 additions & 21 deletions data/locale/en-US.ini
@@ -13,30 +13,9 @@ external_model_file="External model file"
whisper_parameters="Whisper Model Parameters"
language="Input Language"
whisper_sampling_method="Whisper Sampling Method"
n_threads="Number of threads"
n_max_text_ctx="Max text context"
translate="Translate"
translate_local="Local Translation"
translate_cloud="Cloud Translation"
no_context="No context"
single_segment="Single segment"
print_special="Print special"
print_progress="Print progress"
print_realtime="Print realtime"
print_timestamps="Print timestamps"
token_timestamps="Token timestamps"
thold_pt="Token prob. threshold"
thold_ptsum="Token sum prob. threshold"
max_len="Max length in chars"
split_on_word="Split on word"
max_tokens="Max tokens"
speed_up="Speed up"
initial_prompt="Initial prompt"
suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
save_srt="Save in SRT format"
truncate_output_file="Truncate file on new sentence"
only_while_recording="Write output only while recording"
@@ -92,11 +71,51 @@ partial_latency="Latency (ms)"
vad_mode="VAD Mode"
Active_VAD="Active VAD"
Hybrid_VAD="Hybrid VAD"
No_VAD="No VAD"
translate_only_full_sentences="Translate only full sentences"
duration_filter_threshold="Duration filter"
segment_duration="Segment duration"
n_context_sentences="# Context sentences"
max_sub_duration="Max. sub duration (ms)"
# Whisper model parameters
strategy="Strategy"
n_threads="Number of threads"
n_max_text_ctx="Max text context"
offset_ms="Offset (ms)"
duration_ms="Duration (ms)"
whisper_translate="Translate"
no_context="No context"
no_timestamps="No timestamps"
single_segment="Single segment"
print_special="Print special"
print_progress="Print progress"
print_realtime="Print realtime"
print_timestamps="Print timestamps"
token_timestamps="Token timestamps"
thold_pt="Token prob. threshold"
thold_ptsum="Token sum prob. threshold"
max_len="Max length in chars"
split_on_word="Split on word"
max_tokens="Max tokens"
debug_mode="Debug mode"
audio_ctx="Audio context"
tdrz_enable="Enable TDRZ"
suppress_regex="Suppress regex"
initial_prompt="Initial prompt"
language="Input Language"
detect_language="Detect language"
suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
temperature_inc="Temperature increment"
entropy_thold="Entropy threshold"
logprob_thold="Logprob threshold"
no_speech_thold="No speech threshold"
greedy.best_of="Greedy best of"
beam_search.beam_size="Beam size"
beam_search.patience="Patience"
Google-Cloud-Translation="Google Cloud Translation"
Microsoft-Translator="Microsoft Azure Translator"
Amazon-Translate="AWS Translate"
4 changes: 3 additions & 1 deletion src/transcription-filter-callbacks.cpp
@@ -143,7 +143,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
openmode |= std::ios::app;
}
if (!gf->save_srt) {
// Write raw sentence to file
obs_log(gf->log_level, "Saving sentence '%s' to file %s", sentence.c_str(),
gf->output_file_path.c_str());
// Write raw sentence to text file (non-srt format)
try {
std::ofstream output_file(file_path, openmode);
output_file << sentence << std::endl;
1 change: 1 addition & 0 deletions src/transcription-filter-data.h
@@ -87,6 +87,7 @@ struct transcription_filter_data {
bool partial_transcription = false;
int partial_latency = 1000;
float duration_filter_threshold = 2.25f;
// Duration of the target segment buffer in ms
int segment_duration = 7000;

// Cloud translation options
104 changes: 4 additions & 100 deletions src/transcription-filter-properties.cpp
@@ -8,13 +8,15 @@
#include "transcription-filter-utils.h"
#include "whisper-utils/whisper-language.h"
#include "whisper-utils/vad-processing.h"
#include "whisper-utils/whisper-params.h"
#include "model-utils/model-downloader-types.h"
#include "translation/language_codes.h"
#include "ui/filter-replace-dialog.h"
#include "ui/filter-replace-utils.h"

#include <string>
#include <vector>
#include "whisper-utils/whisper-utils.h"

bool translation_options_callback(obs_properties_t *props, obs_property_t *property,
obs_data_t *settings)
@@ -479,6 +481,7 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_
obs_property_t *vad_mode_list =
obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
obs_property_list_add_int(vad_mode_list, MT_("No_VAD"), VAD_MODE_DISABLED);
obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE);
obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID);
// add vad threshold slider
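The new No_VAD entry is stored in the combo box as VAD_MODE_DISABLED, next to the existing active and hybrid modes. A plausible shape for the enum in whisper-utils/vad-processing.h is sketched below; only the three identifiers appear in this diff, so the numeric values and the comments are assumptions:

```cpp
// Assumed VAD mode enum behind the "vad_mode" combo box; identifiers are
// taken from the diff, values and semantics are illustrative.
enum VadMode {
	VAD_MODE_ACTIVE = 0,   // segment on Silero VAD speech boundaries
	VAD_MODE_HYBRID = 1,   // VAD-assisted segmentation with buffering
	VAD_MODE_DISABLED = 2, // "No VAD": fixed-length segments (see segment_duration)
};
```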
@@ -528,82 +531,6 @@ void add_logging_group_properties(obs_properties_t *ppts)
obs_property_list_add_int(list, "WARNING", LOG_WARNING);
}

void add_whisper_params_group_properties(obs_properties_t *ppts)
{
obs_properties_t *whisper_params_group = obs_properties_create();
obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
OBS_GROUP_NORMAL, whisper_params_group);

obs_property_t *whisper_sampling_method_list = obs_properties_add_list(
whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
obs_property_list_add_int(whisper_sampling_method_list, "Beam search",
WHISPER_SAMPLING_BEAM_SEARCH);
obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY);

// add int slider for context sentences
obs_properties_add_int_slider(whisper_params_group, "n_context_sentences",
MT_("n_context_sentences"), 0, 5, 1);

// int n_threads;
obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1);
// int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", MT_("n_max_text_ctx"),
0, 16384, 100);
// int offset_ms; // start offset in ms
// int duration_ms; // audio duration to process in ms
// bool translate;
obs_properties_add_bool(whisper_params_group, "whisper_translate",
MT_("whisper_translate"));
// bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context"));
// bool single_segment; // force single segment output (useful for streaming)
obs_properties_add_bool(whisper_params_group, "single_segment", MT_("single_segment"));
// bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
obs_properties_add_bool(whisper_params_group, "print_special", MT_("print_special"));
// bool print_progress; // print progress information
obs_properties_add_bool(whisper_params_group, "print_progress", MT_("print_progress"));
// bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
obs_properties_add_bool(whisper_params_group, "print_realtime", MT_("print_realtime"));
// bool print_timestamps; // print timestamps for each text segment when printing realtime
obs_properties_add_bool(whisper_params_group, "print_timestamps", MT_("print_timestamps"));
// bool token_timestamps; // enable token-level timestamps
obs_properties_add_bool(whisper_params_group, "token_timestamps", MT_("token_timestamps"));
// enable DTW timestamps
obs_properties_add_bool(whisper_params_group, "dtw_token_timestamps",
MT_("dtw_token_timestamps"));
// float thold_pt; // timestamp token probability threshold (~0.01)
obs_properties_add_float_slider(whisper_params_group, "thold_pt", MT_("thold_pt"), 0.0f,
1.0f, 0.05f);
// float thold_ptsum; // timestamp token sum probability threshold (~0.01)
obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", MT_("thold_ptsum"),
0.0f, 1.0f, 0.05f);
// int max_len; // max segment length in characters
obs_properties_add_int_slider(whisper_params_group, "max_len", MT_("max_len"), 0, 100, 1);
// bool split_on_word; // split on word rather than on token (when used with max_len)
obs_properties_add_bool(whisper_params_group, "split_on_word", MT_("split_on_word"));
// int max_tokens; // max tokens per segment (0 = no limit)
obs_properties_add_int_slider(whisper_params_group, "max_tokens", MT_("max_tokens"), 0, 100,
1);
// const char * initial_prompt;
obs_properties_add_text(whisper_params_group, "initial_prompt", MT_("initial_prompt"),
OBS_TEXT_DEFAULT);
// bool suppress_blank
obs_properties_add_bool(whisper_params_group, "suppress_blank", MT_("suppress_blank"));
// bool suppress_non_speech_tokens
obs_properties_add_bool(whisper_params_group, "suppress_non_speech_tokens",
MT_("suppress_non_speech_tokens"));
// float temperature
obs_properties_add_float_slider(whisper_params_group, "temperature", MT_("temperature"),
0.0f, 1.0f, 0.05f);
// float max_initial_ts
obs_properties_add_float_slider(whisper_params_group, "max_initial_ts",
MT_("max_initial_ts"), 0.0f, 1.0f, 0.05f);
// float length_penalty
obs_properties_add_float_slider(whisper_params_group, "length_penalty",
MT_("length_penalty"), -1.0f, 1.0f, 0.1f);
}

void add_general_group_properties(obs_properties_t *ppts)
{
// add "General" group
@@ -742,28 +669,5 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_string(s, "translate_cloud_region", "eastus");

// Whisper parameters
obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
obs_data_set_default_int(s, "n_context_sentences", 0);
obs_data_set_default_string(s, "initial_prompt", "");
obs_data_set_default_int(s, "n_threads", 4);
obs_data_set_default_int(s, "n_max_text_ctx", 16384);
obs_data_set_default_bool(s, "whisper_translate", false);
obs_data_set_default_bool(s, "no_context", true);
obs_data_set_default_bool(s, "single_segment", true);
obs_data_set_default_bool(s, "print_special", false);
obs_data_set_default_bool(s, "print_progress", false);
obs_data_set_default_bool(s, "print_realtime", false);
obs_data_set_default_bool(s, "print_timestamps", false);
obs_data_set_default_bool(s, "token_timestamps", false);
obs_data_set_default_bool(s, "dtw_token_timestamps", false);
obs_data_set_default_double(s, "thold_pt", 0.01);
obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 50);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
obs_data_set_default_double(s, "temperature", 0.1);
obs_data_set_default_double(s, "max_initial_ts", 1.0);
obs_data_set_default_double(s, "length_penalty", -1.0);
apply_whisper_params_defaults_on_settings(s);
}
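The roughly seventy removed lines of per-field property widgets and defaults now live behind the new whisper-utils/whisper-params module; transcription_filter_defaults only calls apply_whisper_params_defaults_on_settings(s). A sketch of the interface this implies, with signatures inferred from the call sites visible in this PR rather than copied from the actual header:

```cpp
// Hypothetical whisper-params.h, reconstructed from call sites in this PR.
#pragma once

#include <obs.h>
#include <whisper.h>

// Adds the "Whisper Model Parameters" group and its widgets to the filter's
// properties page (successor to the removed add_whisper_params_group_properties).
void add_whisper_params_group_properties(obs_properties_t *ppts);

// Writes defaults for every exposed whisper_full_params field into the
// settings object; called from transcription_filter_defaults above.
void apply_whisper_params_defaults_on_settings(obs_data_t *s);

// Copies the user's settings onto an existing whisper_full_params struct;
// called from transcription_filter_update in transcription-filter.cpp.
void apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *s);
```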
43 changes: 10 additions & 33 deletions src/transcription-filter.cpp
@@ -25,6 +25,7 @@
#include "whisper-utils/whisper-language.h"
#include "whisper-utils/whisper-model-utils.h"
#include "whisper-utils/whisper-utils.h"
#include "whisper-utils/whisper-params.h"
#include "translation/language_codes.h"
#include "translation/translation-utils.h"
#include "translation/translation.h"
@@ -364,51 +365,27 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->sentence_psum_accept_thresh =
(float)obs_data_get_double(s, "sentence_psum_accept_thresh");

gf->whisper_params = whisper_full_default_params(
(whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method"));
gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec");
apply_whisper_params_from_settings(gf->whisper_params, s);

if (!new_translate || gf->translation_model_index != "whisper-based-translation") {
const char *whisper_language_select =
obs_data_get_string(s, "whisper_language_select");
gf->whisper_params.language = (whisper_language_select != nullptr &&
strlen(whisper_language_select) > 0)
? whisper_language_select
: "auto";
const bool language_selected = whisper_language_select != nullptr &&
strlen(whisper_language_select) > 0;
gf->whisper_params.language = (language_selected) ? whisper_language_select
: "auto";
gf->whisper_params.detect_language = !language_selected;
} else {
// take the language from gf->target_lang
if (language_codes_to_whisper.count(gf->target_lang) > 0) {
gf->whisper_params.language =
language_codes_to_whisper[gf->target_lang].c_str();
gf->whisper_params.detect_language = false;
} else {
gf->whisper_params.language = "auto";
gf->whisper_params.detect_language = true;
}
}
gf->whisper_params.initial_prompt =
obs_data_get_string(s, "initial_prompt") != nullptr
? obs_data_get_string(s, "initial_prompt")
: "";
gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads");
gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate");
gf->whisper_params.no_context = obs_data_get_bool(s, "no_context");
gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment");
gf->whisper_params.print_special = obs_data_get_bool(s, "print_special");
gf->whisper_params.print_progress = obs_data_get_bool(s, "print_progress");
gf->whisper_params.print_realtime = obs_data_get_bool(s, "print_realtime");
gf->whisper_params.print_timestamps = obs_data_get_bool(s, "print_timestamps");
gf->whisper_params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
gf->whisper_params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
gf->whisper_params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
gf->whisper_params.max_len = (int)obs_data_get_int(s, "max_len");
gf->whisper_params.split_on_word = obs_data_get_bool(s, "split_on_word");
gf->whisper_params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
gf->whisper_params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
gf->whisper_params.suppress_non_speech_tokens =
obs_data_get_bool(s, "suppress_non_speech_tokens");
gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
gf->whisper_params.no_timestamps = true;

if (gf->vad) {
const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
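On the update path, the long block of per-field assignments is replaced by whisper_full_default_params(strategy) followed by apply_whisper_params_from_settings(gf->whisper_params, s). The helper's body is not part of this diff; the sketch below is an assumption of what it likely does, mirroring the assignments removed above and the new keys added to en-US.ini:

```cpp
// Assumed shape of the helper in src/whisper-utils/whisper-params.cpp;
// not taken from this PR. Field names follow whisper.cpp's whisper_full_params.
#include <obs.h>
#include <whisper.h>

void apply_whisper_params_from_settings(whisper_full_params &params, obs_data_t *s)
{
	params.n_threads = (int)obs_data_get_int(s, "n_threads");
	params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx");
	params.translate = obs_data_get_bool(s, "whisper_translate");
	params.no_context = obs_data_get_bool(s, "no_context");
	params.single_segment = obs_data_get_bool(s, "single_segment");
	params.token_timestamps = obs_data_get_bool(s, "token_timestamps");
	params.thold_pt = (float)obs_data_get_double(s, "thold_pt");
	params.thold_ptsum = (float)obs_data_get_double(s, "thold_ptsum");
	params.max_len = (int)obs_data_get_int(s, "max_len");
	params.split_on_word = obs_data_get_bool(s, "split_on_word");
	params.max_tokens = (int)obs_data_get_int(s, "max_tokens");
	params.suppress_blank = obs_data_get_bool(s, "suppress_blank");
	params.temperature = (float)obs_data_get_double(s, "temperature");
	params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
	params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
	params.initial_prompt = obs_data_get_string(s, "initial_prompt");
	// ...the remaining fields (entropy_thold, logprob_thold, no_speech_thold,
	// greedy.best_of, beam_search.beam_size, beam_search.patience, ...) would
	// follow the same one-settings-key-per-field pattern.
}
```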