From a46a13e4e070829f21028801a058a7f6b25d96c9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 26 Jan 2024 15:38:47 -0500 Subject: [PATCH 1/2] Update buffer size and overlap size handling --- src/transcription-filter.cpp | 23 +++++++++++++++++------ src/whisper-utils/whisper-processing.cpp | 6 +++--- src/whisper-utils/whisper-processing.h | 6 ++---- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 04ae99d..1be2200 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -370,10 +370,11 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); gf->log_words = obs_data_get_bool(s, "log_words"); + gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)obs_data_get_int(s, "buffer_size_msec"))); gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream"); bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing"); gf->step_size_msec = step_by_step_processing ? 
(int)obs_data_get_int(s, "step_size_msec") - : BUFFER_SIZE_MSEC; + : obs_data_get_int(s, "buffer_size_msec"); gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt"); gf->truncate_output_file = obs_data_get_bool(s, "truncate_output_file"); gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording"); @@ -457,7 +458,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->whisper_params = whisper_full_default_params( (whisper_sampling_strategy)obs_data_get_int(s, "whisper_sampling_method")); - gf->whisper_params.duration_ms = BUFFER_SIZE_MSEC; + gf->whisper_params.duration_ms = (int)obs_data_get_int(s, "buffer_size_msec"); gf->whisper_params.language = obs_data_get_string(s, "whisper_language_select"); gf->whisper_params.initial_prompt = obs_data_get_string(s, "initial_prompt"); gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads"); @@ -493,12 +494,12 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) // Get the number of channels for the input source gf->channels = audio_output_get_channels(obs_get_audio()); gf->sample_rate = audio_output_get_sample_rate(obs_get_audio()); - gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC)); + gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)obs_data_get_int(settings, "buffer_size_msec"))); gf->last_num_frames = 0; bool step_by_step_processing = obs_data_get_bool(settings, "step_by_step_processing"); gf->step_size_msec = step_by_step_processing ? 
(int)obs_data_get_int(settings, "step_size_msec") - : BUFFER_SIZE_MSEC; + : obs_data_get_int(settings, "buffer_size_msec"); gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration"); gf->last_sub_render_time = 0; gf->log_level = (int)obs_data_get_int(settings, "log_level"); @@ -523,7 +524,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->context = filter; - gf->overlap_ms = OVERLAP_SIZE_MSEC; + gf->overlap_ms = (int)obs_data_get_int(settings, "overlap_size_msec"); gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms)); obs_log(gf->log_level, "transcription_filter: channels %d, frames %d, sample_rate %d", (int)gf->channels, (int)gf->frames, gf->sample_rate); @@ -644,6 +645,8 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "truncate_output_file", false); obs_data_set_default_bool(s, "only_while_recording", false); obs_data_set_default_bool(s, "rename_file_to_match_recording", true); + obs_data_set_default_int(s, "buffer_size_msec", DEFAULT_BUFFER_SIZE_MSEC); + obs_data_set_default_int(s, "overlap_size_msec", DEFAULT_OVERLAP_SIZE_MSEC); obs_data_set_default_int(s, "step_size_msec", 1000); obs_data_set_default_int(s, "min_sub_duration", 3000); obs_data_set_default_bool(s, "advanced_settings", false); @@ -685,10 +688,16 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_add_bool(ppts, "log_words", MT_("log_words")); obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream")); + + obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000, + DEFAULT_BUFFER_SIZE_MSEC, 50); + obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, + 300, 50); + obs_property_t *step_by_step_processing = obs_properties_add_bool( ppts, "step_by_step_processing", MT_("step_by_step_processing")); obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 
1000, - BUFFER_SIZE_MSEC, 50); + DEFAULT_BUFFER_SIZE_MSEC, 50); obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000, 50); @@ -730,6 +739,8 @@ obs_properties_t *transcription_filter_properties(void *data) obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"), show_hide); obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide); + obs_property_set_visible(obs_properties_get(props, "truncate_output_file"), + show_hide); obs_property_set_visible(obs_properties_get(props, "only_while_recording"), show_hide); obs_property_set_visible( diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 0b88ce1..22c2efa 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -55,12 +55,12 @@ float calculate_segment_energy(const float *pcmf32, size_t pcm32f_size) return energy / (float)pcm32f_size; } -size_t find_tail_word_cutoff(const float *pcmf32, size_t pcm32f_size, uint32_t sample_rate_hz) +size_t find_tail_word_cutoff(const float *pcmf32, size_t pcm32f_size, size_t overlap_ms, uint32_t sample_rate_hz) { // segment size: 10ms worth of samples const size_t segment_size = 10 * sample_rate_hz / 1000; // overlap size in samples - const size_t overlap_size = OVERLAP_SIZE_MSEC * sample_rate_hz / 1000; + const size_t overlap_size = overlap_ms * sample_rate_hz / 1000; // tail lookup window starting point const size_t tail_lookup_start = pcm32f_size - overlap_size; @@ -321,7 +321,7 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) if (!skipped_inference) { // find the tail word cutoff const size_t tail_word_cutoff = - find_tail_word_cutoff(output[0], out_frames, WHISPER_SAMPLE_RATE); + find_tail_word_cutoff(output[0], out_frames, gf->overlap_ms, WHISPER_SAMPLE_RATE); if (tail_word_cutoff < out_frames) obs_log(gf->log_level, "tail word cutoff: %d frames", (int)tail_word_cutoff); diff 
--git a/src/whisper-utils/whisper-processing.h b/src/whisper-utils/whisper-processing.h index 19b10fa..edc8a66 100644 --- a/src/whisper-utils/whisper-processing.h +++ b/src/whisper-utils/whisper-processing.h @@ -2,11 +2,9 @@ #define WHISPER_PROCESSING_H // buffer size in msec -#define BUFFER_SIZE_MSEC 3000 -// at 16Khz, BUFFER_SIZE_MSEC is WHISPER_FRAME_SIZE samples -#define WHISPER_FRAME_SIZE 48000 +#define DEFAULT_BUFFER_SIZE_MSEC 3000 // overlap in msec -#define OVERLAP_SIZE_MSEC 100 +#define DEFAULT_OVERLAP_SIZE_MSEC 100 void whisper_loop(void *data); struct whisper_context *init_whisper_context(const std::string &model_path); From d3f2362866a499cf178dad43eeb9535d51400394 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 26 Jan 2024 15:39:05 -0500 Subject: [PATCH 2/2] Refactor buffer size calculation and formatting in transcription filter This commit re-wraps the buffer size calculation (the gf->frames assignments) in the transcription filter code across two lines and re-indents the slider property calls; the computed values are unchanged. It also re-wraps the find_tail_word_cutoff signature and its call site in the whisper-processing file. These are formatting-only changes with no functional effect. 
--- src/transcription-filter.cpp | 12 +++++++----- src/whisper-utils/whisper-processing.cpp | 7 ++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 1be2200..3d1dd50 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -370,7 +370,8 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); gf->log_words = obs_data_get_bool(s, "log_words"); - gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)obs_data_get_int(s, "buffer_size_msec"))); + gf->frames = (size_t)((float)gf->sample_rate / + (1000.0f / (float)obs_data_get_int(s, "buffer_size_msec"))); gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream"); bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing"); gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec") @@ -494,7 +495,8 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) // Get the number of channels for the input source gf->channels = audio_output_get_channels(obs_get_audio()); gf->sample_rate = audio_output_get_sample_rate(obs_get_audio()); - gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)obs_data_get_int(settings, "buffer_size_msec"))); + gf->frames = (size_t)((float)gf->sample_rate / + (1000.0f / (float)obs_data_get_int(settings, "buffer_size_msec"))); gf->last_num_frames = 0; bool step_by_step_processing = obs_data_get_bool(settings, "step_by_step_processing"); gf->step_size_msec = step_by_step_processing @@ -690,9 +692,9 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream")); obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000, - DEFAULT_BUFFER_SIZE_MSEC, 50); - obs_properties_add_int_slider(ppts, "overlap_size_msec", 
MT_("overlap_size_msec"), 50, - 300, 50); + DEFAULT_BUFFER_SIZE_MSEC, 50); + obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, 300, + 50); obs_property_t *step_by_step_processing = obs_properties_add_bool( ppts, "step_by_step_processing", MT_("step_by_step_processing")); diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 22c2efa..f1fe8a8 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -55,7 +55,8 @@ float calculate_segment_energy(const float *pcmf32, size_t pcm32f_size) return energy / (float)pcm32f_size; } -size_t find_tail_word_cutoff(const float *pcmf32, size_t pcm32f_size, size_t overlap_ms, uint32_t sample_rate_hz) +size_t find_tail_word_cutoff(const float *pcmf32, size_t pcm32f_size, size_t overlap_ms, + uint32_t sample_rate_hz) { // segment size: 10ms worth of samples const size_t segment_size = 10 * sample_rate_hz / 1000; @@ -320,8 +321,8 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) if (!skipped_inference) { // find the tail word cutoff - const size_t tail_word_cutoff = - find_tail_word_cutoff(output[0], out_frames, gf->overlap_ms, WHISPER_SAMPLE_RATE); + const size_t tail_word_cutoff = find_tail_word_cutoff( + output[0], out_frames, gf->overlap_ms, WHISPER_SAMPLE_RATE); if (tail_word_cutoff < out_frames) obs_log(gf->log_level, "tail word cutoff: %d frames", (int)tail_word_cutoff);