locaal-ai · royshil · Oct 29, 2023 · Oct 29, 2023
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -41,3 +41,4 @@ save_srt="Save in SRT format (no file truncation)"
 only_while_recording="Write output only while recording"
 process_while_muted="Process speech while source is muted"
 rename_file_to_match_recording="Rename file to match recording"
+min_sub_duration="Minimum subtitle duration (msec)"
diff --git a/data/locale/pt_BR.ini b/data/locale/pt_BR.ini
@@ -42,3 +42,4 @@ save_srt="Salvar no formato SRT"
 only_while_recording="Escreva durante a gravação"
 process_while_muted="Processar enquanto está silenciada"
 rename_file_to_match_recording="Renomear arquivo para corresponder à gravação"
+min_sub_duration="Duração mínima da legenda (msec)"
diff --git a/data/locale/ru_RU.ini b/data/locale/ru_RU.ini
@@ -41,3 +41,4 @@ save_srt="Сохранить в формате SRT"
 only_while_recording="Записывать вывод только во время записи"
 process_while_muted="Обрабатывать речь, пока источник отключен"
 rename_file_to_match_recording="Переименовать файл, чтобы соответствовать записи"
+min_sub_duration="Минимальная длительность субтитров (мс)"
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -49,6 +49,10 @@ struct transcription_filter_data {
 	uint64_t start_timestamp_ms;
 	// Sentence counter for srt
 	size_t sentence_number;
+	// Minimum time (ms) a subtitle stays up before an empty result may clear it
+	size_t min_sub_duration;
+	// Timestamp in ms (from now_ms()) of the last subtitle update, including clears
+	uint64_t last_sub_render_time;
 
 	/* PCM buffers */
 	float *copy_buffers[MAX_PREPROC_CHANNELS];

diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
@@ -193,8 +193,23 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
 	}
 }
 
-void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
+void set_text_callback(struct transcription_filter_data *gf,
+		       const DetectionResultWithText &resultIn)
 {
+	DetectionResultWithText result = resultIn;
+	uint64_t now = now_ms();
+	if (result.text.empty() || result.result != DETECTION_RESULT_SPEECH) {
+		// check if we should clear the current sub depending on the minimum subtitle duration
+		if ((now - gf->last_sub_render_time) > gf->min_sub_duration) {
+			// clear the current sub, run an empty sub
+			result.text = "";
+		} else {
+			// minimum duration has not elapsed yet: keep the current sub and ignore the empty one
+			return;
+		}
+	}
+	gf->last_sub_render_time = now;
+
 #ifdef _WIN32
 	// Some UTF8 charsets on Windows output have a bug, instead of 0xd? it outputs
 	// 0xf?, and 0xc? becomes 0xe?, so we need to replace it.
@@ -356,6 +371,8 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->start_timestamp_ms = now_ms();
 	gf->sentence_number = 1;
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
+	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
+	gf->last_sub_render_time = 0;
 
 	obs_log(gf->log_level, "transcription_filter: update text source");
 	// update the text source
@@ -475,7 +492,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		}
 	} else {
 		// model path did not change
-		obs_log(LOG_INFO, "model path did not change: %s == %s", gf->whisper_model_path,
+		obs_log(LOG_DEBUG, "model path did not change: %s == %s", gf->whisper_model_path,
 			new_model_path.c_str());
 	}
 
@@ -531,6 +548,8 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->step_size_msec = step_by_step_processing
 				     ? (int)obs_data_get_int(settings, "step_size_msec")
 				     : BUFFER_SIZE_MSEC;
+	gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration");
+	gf->last_sub_render_time = 0;
 	gf->log_level = (int)obs_data_get_int(settings, "log_level");
 	gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
 	gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
@@ -673,6 +692,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "only_while_recording", false);
 	obs_data_set_default_bool(s, "rename_file_to_match_recording", true);
 	obs_data_set_default_int(s, "step_size_msec", 1000);
+	obs_data_set_default_int(s, "min_sub_duration", 3000);
 
 	// Whisper parameters
 	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
@@ -721,6 +741,8 @@ obs_properties_t *transcription_filter_properties(void *data)
 		ppts, "step_by_step_processing", MT_("step_by_step_processing"));
 	obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000,
 				      BUFFER_SIZE_MSEC, 50);
+	obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000,
+				      50);
 
 	obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props,
 								       obs_property_t *property,