Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump whisper.cpp. Simple settings mode #60

Merged
merged 2 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions buildspec.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@
}
},
"name": "obs-localvocal",
"version": "0.0.7",
"version": "0.0.8",
"author": "Roy Shilkrot",
"website": "https://github.com/obs-ai/obs-localvocal",
"website": "https://github.com/occ-ai/obs-localvocal",
"email": "[email protected]",
"uuids": {
"macosPackage": "CB66E5DF-FF45-4BEA-B38B-7AD3705860C9",
Expand Down
2 changes: 1 addition & 1 deletion cmake/BuildWhispercpp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ include(ExternalProject)

set(CMAKE_OSX_ARCHITECTURES_ "arm64$<SEMICOLON>x86_64")

set(Whispercpp_Build_GIT_TAG "ec7a6f04f9c32adec2e6b0995b8c728c5bf56f35")
set(Whispercpp_Build_GIT_TAG "8986690c2a7b81b2b5d79cdc186b5aa672311740")

if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL RelWithDebInfo)
set(Whispercpp_BUILD_TYPE Release)
Expand Down
11 changes: 6 additions & 5 deletions data/locale/en-US.ini
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
LocalVocalPlugin="LocalVocal Plugin"
transcription_filterAudioFilter="LocalVocal Transcription"
vad_enabled="VAD Enabled"
log_level="Log Level"
log_words="Log Output Words"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
step_by_step_processing="Step-by-step processing (⚠️ processing will increase)"
step_by_step_processing="Step-by-step processing (⚠️ increased processing)"
step_size_msec="Step size (ms)"
subtitle_sources="Subtitles Output"
none_no_output="None / No output"
text_file_output="Text File output"
output_filename="Output filename"
whisper_model="Whisper Model"
external_model_file="External model file"
whisper_parameters="Whisper Parameters"
whisper_parameters="Advanced Settings"
language="Language"
whisper_sampling_method="Whisper Sampling Method"
n_threads="Number of threads"
Expand Down Expand Up @@ -41,4 +41,5 @@ save_srt="Save in SRT format (no file truncation)"
only_while_recording="Write output only while recording"
process_while_muted="Process speech while source is muted"
rename_file_to_match_recording="Rename file to match recording"
min_sub_duration="Minimal subtitle duration (msec)"
min_sub_duration="Min. sub duration (ms)"
advanced_settings="Advanced Settings"
42 changes: 31 additions & 11 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ struct transcription_filter_data {
struct circlebuf input_buffers[MAX_PREPROC_CHANNELS];

/* Resampler */
audio_resampler_t *resampler = nullptr;
audio_resampler_t *resampler;

/* whisper */
char *whisper_model_path = nullptr;
struct whisper_context *whisper_context = nullptr;
char *whisper_model_path;
struct whisper_context *whisper_context;
whisper_full_params whisper_params;

float filler_p_threshold;
Expand All @@ -81,21 +81,41 @@ struct transcription_filter_data {
bool rename_file_to_match_recording = false;

// Text source to output the subtitles
obs_weak_source_t *text_source = nullptr;
char *text_source_name = nullptr;
std::mutex *text_source_mutex = nullptr;
obs_weak_source_t *text_source;
char *text_source_name;
std::mutex *text_source_mutex;
// Callback to set the text in the output text source (subtitles)
std::function<void(const DetectionResultWithText &result)> setTextCallback;
// Output file path to write the subtitles
std::string output_file_path = "";
std::string whisper_model_file_currently_loaded = "";
std::string output_file_path;
std::string whisper_model_file_currently_loaded;

// Use std for thread and mutex
std::thread whisper_thread;

std::mutex *whisper_buf_mutex = nullptr;
std::mutex *whisper_ctx_mutex = nullptr;
std::condition_variable *wshiper_thread_cv = nullptr;
std::mutex *whisper_buf_mutex;
std::mutex *whisper_ctx_mutex;
std::condition_variable *wshiper_thread_cv;

// ctor
transcription_filter_data()
{
// initialize all pointers to nullptr
for (size_t i = 0; i < MAX_PREPROC_CHANNELS; i++) {
copy_buffers[i] = nullptr;
}
context = nullptr;
resampler = nullptr;
whisper_model_path = nullptr;
whisper_context = nullptr;
text_source = nullptr;
text_source_mutex = nullptr;
whisper_buf_mutex = nullptr;
whisper_ctx_mutex = nullptr;
wshiper_thread_cv = nullptr;
output_file_path = "";
whisper_model_file_currently_loaded = "";
}
};

// Audio packet info
Expand Down
60 changes: 36 additions & 24 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <Windows.h>
#endif

#include <QString>

inline enum speaker_layout convert_speaker_layout(uint8_t channels)
{
switch (channels) {
Expand Down Expand Up @@ -125,19 +127,7 @@ void transcription_filter_destroy(void *data)
static_cast<struct transcription_filter_data *>(data);

obs_log(gf->log_level, "transcription_filter_destroy");
{
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
if (gf->whisper_context != nullptr) {
whisper_free(gf->whisper_context);
gf->whisper_context = nullptr;
gf->wshiper_thread_cv->notify_all();
}
}

// join the thread
if (gf->whisper_thread.joinable()) {
gf->whisper_thread.join();
}
shutdown_whisper_thread(gf);

if (gf->text_source_name) {
bfree(gf->text_source_name);
Expand Down Expand Up @@ -448,14 +438,14 @@ void transcription_filter_update(void *data, obs_data_t *s)
obs_weak_source_release(old_weak_text_source);
}

obs_log(gf->log_level, "transcription_filter: update whisper model");
update_whsiper_model_path(gf, s);

if (!gf->whisper_ctx_mutex) {
if (gf->whisper_ctx_mutex == nullptr) {
obs_log(LOG_ERROR, "whisper_ctx_mutex is null");
return;
}

obs_log(gf->log_level, "transcription_filter: update whisper model");
update_whsiper_model_path(gf, s);

obs_log(gf->log_level, "transcription_filter: update whisper params");
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);

Expand Down Expand Up @@ -492,7 +482,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
{
obs_log(LOG_INFO, "transcription filter create");

struct transcription_filter_data *gf = new transcription_filter_data;
struct transcription_filter_data *gf = new transcription_filter_data();

// Get the number of channels for the input source
gf->channels = audio_output_get_channels(obs_get_audio());
Expand Down Expand Up @@ -648,6 +638,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_bool(s, "rename_file_to_match_recording", true);
obs_data_set_default_int(s, "step_size_msec", 1000);
obs_data_set_default_int(s, "min_sub_duration", 3000);
obs_data_set_default_bool(s, "advanced_settings", false);

// Whisper parameters
obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
Expand Down Expand Up @@ -684,12 +675,6 @@ obs_properties_t *transcription_filter_properties(void *data)

obs_properties_t *ppts = obs_properties_create();

obs_properties_add_bool(ppts, "vad_enabled", MT_("vad_enabled"));
obs_property_t *list = obs_properties_add_list(ppts, "log_level", MT_("log_level"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
obs_property_list_add_int(list, "INFO", LOG_INFO);
obs_property_list_add_int(list, "WARNING", LOG_WARNING);
obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
obs_property_t *step_by_step_processing = obs_properties_add_bool(
Expand Down Expand Up @@ -799,10 +784,31 @@ obs_properties_t *transcription_filter_properties(void *data)
return true;
});

obs_property_t *advanced_settings_prop =
obs_properties_add_bool(ppts, "advanced_settings", MT_("advanced_settings"));
obs_property_set_modified_callback(advanced_settings_prop, [](obs_properties_t *props,
obs_property_t *property,
obs_data_t *settings) {
UNUSED_PARAMETER(property);
// If advanced settings is enabled, show the advanced settings group
const bool show_hide = obs_data_get_bool(settings, "advanced_settings");
obs_property_set_visible(obs_properties_get(props, "whisper_params_group"),
show_hide);
return true;
});

obs_properties_t *whisper_params_group = obs_properties_create();
obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
OBS_GROUP_NORMAL, whisper_params_group);

obs_properties_add_bool(whisper_params_group, "vad_enabled", MT_("vad_enabled"));
obs_property_t *list = obs_properties_add_list(whisper_params_group, "log_level",
MT_("log_level"), OBS_COMBO_TYPE_LIST,
OBS_COMBO_FORMAT_INT);
obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
obs_property_list_add_int(list, "INFO", LOG_INFO);
obs_property_list_add_int(list, "WARNING", LOG_WARNING);

// Add language selector
obs_property_t *whisper_language_select_list = obs_properties_add_list(
whisper_params_group, "whisper_language_select", MT_("language"),
Expand Down Expand Up @@ -885,6 +891,12 @@ obs_properties_t *transcription_filter_properties(void *data)
obs_properties_add_float_slider(whisper_params_group, "length_penalty",
MT_("length_penalty"), -1.0f, 1.0f, 0.1f);

// Add an informative text about the plugin
obs_properties_add_text(
ppts, "info",
QString(PLUGIN_INFO_TEMPLATE).arg(PLUGIN_VERSION).toStdString().c_str(),
OBS_TEXT_INFO);

UNUSED_PARAMETER(data);
return ppts;
}
5 changes: 5 additions & 0 deletions src/transcription-filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ void transcription_filter_deactivate(void *data);
void transcription_filter_defaults(obs_data_t *s);
obs_properties_t *transcription_filter_properties(void *data);

// HTML snippet for the informational text row shown in the filter properties.
// "%1" is a QString::arg() placeholder; transcription_filter_properties()
// substitutes PLUGIN_VERSION into it before adding the OBS_TEXT_INFO property.
const char *const PLUGIN_INFO_TEMPLATE =
"<a href=\"https://github.com/occ-ai/obs-localvocal/\">LocalVocal</a> (%1) by "
"<a href=\"https://github.com/occ-ai\">OCC AI</a> ❤️ "
"<a href=\"https://www.patreon.com/RoyShilkrot\">Support & Follow</a>";

#ifdef __cplusplus
}
#endif
2 changes: 1 addition & 1 deletion src/whisper-utils/whisper-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ void shutdown_whisper_thread(struct transcription_filter_data *gf)
void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path)
{
obs_log(gf->log_level, "start_whisper_thread_with_path: %s", path.c_str());
if (!gf->whisper_ctx_mutex) {
if (gf->whisper_ctx_mutex == nullptr) {
obs_log(LOG_ERROR, "cannot init whisper: whisper_ctx_mutex is null");
return;
}
Expand Down
Loading