Skip to content

Commit

Permalink
Merge pull request #36 from obs-ai/roy.srt_output_muted_disable
Browse files Browse the repository at this point in the history
srt subtitles file saving and source mute detection
  • Loading branch information
royshil authored Oct 12, 2023
2 parents a02b2ac + 82a473f commit b21a42a
Show file tree
Hide file tree
Showing 8 changed files with 199 additions and 45 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ The plugin was built and tested on Mac OSX (Intel & Apple silicon), Windows and

Start by cloning this repo to a directory of your choice.

Remember to sync and fetch the submodules before building, e.g.
```sh
$ git submodule sync --recursive
$ git update --init --recursive
```

### Mac OSX

Using the CI pipeline scripts, locally you would just call the zsh script. By default this builds a universal binary for both Intel and Apple Silicon. To build for a specific architecture please see `.github/scripts/.build.zsh` for the `-arch` options.
Expand Down
2 changes: 1 addition & 1 deletion cmake/linux/defaults.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ set(CPACK_SOURCE_IGNORE_FILES
# cmake-format: sortable
".*~$"
\\.git/
\\.github/
# \\.github/
\\.gitignore
build_.*
cmake/\\.CMakeBuildNumber
Expand Down
6 changes: 5 additions & 1 deletion data/locale/en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,8 @@ suppress_blank="Suppress blank"
suppress_non_speech_tokens="Suppress non-speech tokens"
temperature="Temperature"
max_initial_ts="Max initial timestamps"
length_penalty="Length penalty"
length_penalty="Length penalty"
save_srt="Save in SRT format (no file truncation)"
only_while_recording="Write output only while recording"
process_while_muted="Process speech while source is muted"
rename_file_to_match_recording="Rename file to match recording"
4 changes: 4 additions & 0 deletions data/locale/pt_BR.ini
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ suppress_non_speech_tokens="Suprimir tokens não fala"
temperature="Temperatura"
max_initial_ts="Tempo inicial máximo"
length_penalty="Pena de comprimento"
save_srt="Salvar no formato SRT"
only_while_recording="Escreva durante a gravação"
process_while_muted="Processar enquanto está silenciada"
rename_file_to_match_recording="Renomear arquivo para corresponder à gravação"
4 changes: 4 additions & 0 deletions data/locale/ru_RU.ini
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,7 @@ suppress_non_speech_tokens="Подавить токены, не относящи
temperature="Температура"
max_initial_ts="Максимальное начальное время"
length_penalty="Штраф за длину"
save_srt="Сохранить в формате SRT"
only_while_recording="Записывать вывод только во время записи"
process_while_muted="Обрабатывать речь, пока источник отключен"
rename_file_to_match_recording="Переименовать файл, чтобы соответствовать записи"
27 changes: 24 additions & 3 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,21 @@

#define MT_ obs_module_text

enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
DETECTION_RESULT_SILENCE = 1,
DETECTION_RESULT_SPEECH = 2,
};

struct DetectionResultWithText {
DetectionResult result;
std::string text;
uint64_t start_timestamp_ms;
uint64_t end_timestamp_ms;
};

struct transcription_filter_data {
obs_source_t *context; // obs input source
obs_source_t *context; // obs filter source (this filter)
size_t channels; // number of channels
uint32_t sample_rate; // input sample rate
// How many input frames (in input sample rate) are needed for the next whisper frame
Expand All @@ -32,6 +45,10 @@ struct transcription_filter_data {
size_t last_num_frames;
// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
size_t step_size_msec;
// Start begining timestamp in ms since epoch
uint64_t start_timestamp_ms;
// Sentence counter for srt
size_t sentence_number;

/* PCM buffers */
float *copy_buffers[MAX_PREPROC_CHANNELS];
Expand All @@ -54,13 +71,17 @@ struct transcription_filter_data {
bool log_words;
bool caption_to_stream;
bool active = false;
bool save_srt = false;
bool save_only_while_recording = false;
bool process_while_muted = false;
bool rename_file_to_match_recording = false;

// Text source to output the subtitles
obs_weak_source_t *text_source = nullptr;
char *text_source_name = nullptr;
std::mutex *text_source_mutex = nullptr;
// Callback to set the text in the output text source (subtitles)
std::function<void(const std::string &str)> setTextCallback;
std::function<void(const DetectionResultWithText &result)> setTextCallback;
// Output file path to write the subtitles
std::string output_file_path = "";
std::string whisper_model_file_currently_loaded = "";
Expand All @@ -79,6 +100,6 @@ struct transcription_filter_audio_info {
uint64_t timestamp;
};

void set_text_callback(struct transcription_filter_data *gf, const std::string &str);
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);

#endif /* TRANSCRIPTION_FILTER_DATA_H */
151 changes: 134 additions & 17 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)
}
}

inline uint64_t now_ms()
{
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
}

bool add_sources_to_list(void *list_property, obs_source_t *source)
{
auto source_id = obs_source_get_id(source);
Expand Down Expand Up @@ -71,6 +78,13 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
return audio;
}

// Check if the parent source is muted
obs_source_t *parent_source = obs_filter_get_parent(gf->context);
if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
// Source is muted, do not process audio
return audio;
}

if (gf->whisper_context == nullptr) {
// Whisper not initialized, just pass through
return audio;
Expand Down Expand Up @@ -179,35 +193,75 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
}
}

void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
{
#ifdef _WIN32
// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
// 0xf?, so we need to replace it. This doesn't affect any other charset, which
// outputs the correct UTF8 output. (Except maybe for Greek?)
std::string str_copy = str;
std::string str_copy = result.text;
for (size_t i = 0; i < str_copy.size(); ++i) {
// if the char MSBs starts with 0xf replace the MSBs with 0xd
if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xd0;
}
}
#else
std::string str_copy = str;
std::string str_copy = result.text;
#endif

if (gf->caption_to_stream) {
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) {
obs_output_output_caption_text1(streaming_output, str.c_str());
obs_output_output_caption_text1(streaming_output, result.text.c_str());
obs_output_release(streaming_output);
}
}

if (gf->output_file_path != "" && !gf->text_source_name) {
// Write to file, do not append
std::ofstream output_file(gf->output_file_path, std::ios::out | std::ios::trunc);
output_file << str;
output_file.close();
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}
if (!gf->save_srt) {
// Write raw sentence to file, do not append
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::trunc);
output_file << result.text << std::endl;
output_file.close();
} else {
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path,
std::ios::out | std::ios::app);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [&output_file](uint64_t ts) {
uint64_t time_s = ts / 1000;
uint64_t time_m = time_s / 60;
uint64_t time_h = time_m / 60;
uint64_t time_ms_rem = ts % 1000;
uint64_t time_s_rem = time_s % 60;
uint64_t time_m_rem = time_m % 60;
uint64_t time_h_rem = time_h % 60;
output_file << std::setfill('0') << std::setw(2) << time_h_rem
<< ":" << std::setfill('0') << std::setw(2)
<< time_m_rem << ":" << std::setfill('0')
<< std::setw(2) << time_s_rem << ","
<< std::setfill('0') << std::setw(3) << time_ms_rem;
};
format_ts_for_srt(result.start_timestamp_ms);
output_file << " --> ";
format_ts_for_srt(result.end_timestamp_ms);
output_file << std::endl;

output_file << result.text << std::endl;
output_file << std::endl;
output_file.close();
gf->sentence_number++;
}
} else {
if (!gf->text_source_mutex) {
obs_log(LOG_ERROR, "text_source_mutex is null");
Expand Down Expand Up @@ -292,6 +346,13 @@ void transcription_filter_update(void *data, obs_data_t *s)
bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
gf->rename_file_to_match_recording = obs_data_get_bool(s, "rename_file_to_match_recording");
// Get the current timestamp using the system clock
gf->start_timestamp_ms = now_ms();
gf->sentence_number = 1;
gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");

obs_log(gf->log_level, "transcription_filter: update text source");
// update the text source
Expand Down Expand Up @@ -468,6 +529,11 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
? (int)obs_data_get_int(settings, "step_size_msec")
: BUFFER_SIZE_MSEC;
gf->log_level = (int)obs_data_get_int(settings, "log_level");
gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
gf->rename_file_to_match_recording =
obs_data_get_bool(settings, "rename_file_to_match_recording");
gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");

for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
circlebuf_init(&gf->input_buffers[i]);
Expand Down Expand Up @@ -525,6 +591,48 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)

gf->active = true;

// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
// to match the subtitles with the recording
obs_frontend_add_event_callback(
[](enum obs_frontend_event event, void *private_data) {
if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
struct transcription_filter_data *gf_ =
static_cast<struct transcription_filter_data *>(
private_data);
if (gf_->save_srt && gf_->save_only_while_recording) {
obs_log(gf_->log_level,
"Recording started. Resetting srt file.");
// truncate file if it exists
std::ofstream output_file(gf_->output_file_path,
std::ios::out | std::ios::trunc);
output_file.close();
gf_->sentence_number = 1;
gf_->start_timestamp_ms = now_ms();
}
} else if (event == OBS_FRONTEND_EVENT_RECORDING_STOPPED) {
struct transcription_filter_data *gf_ =
static_cast<struct transcription_filter_data *>(
private_data);
if (gf_->save_srt && gf_->save_only_while_recording &&
gf_->rename_file_to_match_recording) {
obs_log(gf_->log_level,
"Recording stopped. Rename srt file.");
// rename file to match the recording file name with .srt extension
// use obs_frontend_get_last_recording to get the last recording file name
std::string recording_file_name =
obs_frontend_get_last_recording();
// remove the extension
recording_file_name = recording_file_name.substr(
0, recording_file_name.find_last_of("."));
std::string srt_file_name = recording_file_name + ".srt";
// rename the file
std::rename(gf_->output_file_path.c_str(),
srt_file_name.c_str());
}
}
},
gf);

obs_log(gf->log_level, "transcription_filter: filter created.");
return gf;
}
Expand Down Expand Up @@ -557,6 +665,10 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_string(s, "whisper_language_select", "en");
obs_data_set_default_string(s, "subtitle_sources", "none");
obs_data_set_default_bool(s, "step_by_step_processing", false);
obs_data_set_default_bool(s, "process_while_muted", false);
obs_data_set_default_bool(s, "subtitle_save_srt", false);
obs_data_set_default_bool(s, "only_while_recording", false);
obs_data_set_default_bool(s, "rename_file_to_match_recording", true);
obs_data_set_default_int(s, "step_size_msec", 1000);

// Whisper parameters
Expand Down Expand Up @@ -617,6 +729,7 @@ obs_properties_t *transcription_filter_properties(void *data)
return true;
});

obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
obs_property_t *subs_output =
obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
Expand All @@ -628,21 +741,25 @@ obs_properties_t *transcription_filter_properties(void *data)

obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"),
OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL);
obs_properties_add_bool(ppts, "subtitle_save_srt", MT_("save_srt"));
obs_properties_add_bool(ppts, "only_while_recording", MT_("only_while_recording"));
obs_properties_add_bool(ppts, "rename_file_to_match_recording",
MT_("rename_file_to_match_recording"));

obs_property_set_modified_callback(subs_output, [](obs_properties_t *props,
obs_property_t *property,
obs_data_t *settings) {
UNUSED_PARAMETER(property);
// Show or hide the output filename selection input
const char *new_output = obs_data_get_string(settings, "subtitle_sources");
if (strcmp(new_output, "text_file") == 0) {
// Show the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), true);
} else {
// Hide the output filename selection input
obs_property_set_visible(
obs_properties_get(props, "subtitle_output_filename"), false);
}
const bool show_hide = (strcmp(new_output, "text_file") == 0);
obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"),
show_hide);
obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
obs_property_set_visible(obs_properties_get(props, "only_while_recording"),
show_hide);
obs_property_set_visible(
obs_properties_get(props, "rename_file_to_match_recording"), show_hide);
return true;
});

Expand Down
Loading

0 comments on commit b21a42a

Please sign in to comment.