Skip to content

Commit

Permalink
Offline test improvements (#150)
Browse files Browse the repository at this point in the history
* look at the front of the whisper buffer instead of the back

This should mostly not make a difference, but it feels semantically
more correct.

* Initialize `resampled_buffer` for offline tests

* Read relevant audio bytes

There are two issues here:
1. `linesize` may contain padding (didn't happen in my tests)
2. from: https://git.ffmpeg.org/gitweb/ffmpeg.git/blob/2b5f000d3f6f9e737e918a5438e6c881f65e70e2:/libavutil/frame.h#l405
> For audio, only linesize[0] may be set. For planar audio, each
> channel plane must be the same size.

* log running time in addition to local time

* Run whisper test "as fast as possible"

This behaves roughly like libobs: each chunk of audio is inspected
individually by VAD/whisper until processing by either one takes
longer than the window length, in which case audio continues to
stream in.

* Only ever send a single chunk of audio

* Add additional files to tests copy command

* Use condition variable to signal input thread if available

* Only wait in whisper thread if input buffers are empty
  • Loading branch information
palana authored Aug 9, 2024
1 parent 09839bb commit 6cc88b1
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 41 deletions.
3 changes: 2 additions & 1 deletion src/tests/audio-file-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function<void(int, int)> initializati
for (int j = 0; j < codecContext->channels; j++) {
buffer[j].insert(buffer[j].end(), frame->data[j],
frame->data[j] +
frame->linesize[0]);
frame->nb_samples *
sizeof(float));
}
}
}
Expand Down
28 changes: 15 additions & 13 deletions src/tests/copy_dlls.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,23 @@ $obsDlls = @(
".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll",
".\release\Release\obs-plugins\64bit\onnxruntime.dll",
".\release\Release\obs-plugins\64bit\whisper.dll",
".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll",
".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll",
".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll",
".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll",
".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll",
".deps\obs-deps-2023-11-03-x64\bin\zlib.dll"
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
".\release\Release\obs-plugins\64bit\ggml.dll",
".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll",
".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll",
".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll",
".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll",
".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll",
".deps\obs-deps-2024-03-19-x64\bin\zlib.dll"
".deps\obs-deps-2024-03-19-x64\bin\librist.dll"
".deps\obs-deps-2024-03-19-x64\bin\srt.dll"
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
)

# Copy every file listed in $obsDlls into $testToolPath.
# $_ is the current path coming through the pipeline.
$obsDlls | ForEach-Object {
Copy-Item -Force -Path $_ -Destination $testToolPath
}

74 changes: 52 additions & 22 deletions src/tests/localvocal-offline-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

void obs_log(int log_level, const char *format, ...)
{
static auto start = std::chrono::system_clock::now();
if (log_level == LOG_DEBUG) {
return;
}
Expand All @@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...)
std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
std::tm now_tm = *std::localtime(&now_time_t);

auto diff = now - start;

// print timestamp
printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec,
(int)(epoch.count() % 1000));
printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
now_tm.tm_sec, (int)(epoch.count() % 1000),
std::chrono::duration_cast<std::chrono::minutes>(diff).count(),
std::chrono::duration_cast<std::chrono::seconds>(diff).count() % 60,
std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() % 1000);

// print log level
switch (log_level) {
Expand Down Expand Up @@ -95,12 +101,14 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->process_while_muted = false;
gf->buffered_output = false;
gf->fix_utf8 = true;
gf->input_cv.emplace();

for (size_t i = 0; i < gf->channels; i++) {
circlebuf_init(&gf->input_buffers[i]);
}
circlebuf_init(&gf->info_buffer);
circlebuf_init(&gf->whisper_buffer);
circlebuf_init(&gf->resampled_buffer);

// allocate copy buffers
gf->copy_buffers[0] =
Expand Down Expand Up @@ -307,6 +315,7 @@ void release_context(transcription_filter_data *gf)
}
circlebuf_free(&gf->info_buffer);
circlebuf_free(&gf->whisper_buffer);
circlebuf_free(&gf->resampled_buffer);

delete gf;
}
Expand Down Expand Up @@ -420,19 +429,23 @@ int wmain(int argc, wchar_t *argv[])
std::remove("segments.json");
}

const auto window_size_in_ms = std::chrono::milliseconds(25);

// fill up the whisper buffer
{
gf->start_timestamp_ms = now_ms();

obs_log(LOG_INFO, "Sending samples to whisper buffer");
// 25 ms worth of frames
int frames = gf->sample_rate * 25 / 1000;
int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
const int frame_size_bytes = sizeof(float);
int frames_size_bytes = frames * frame_size_bytes;
int frames_count = 0;
int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
auto start_time_time = std::chrono::system_clock::now();
uint64_t window_number = 0;
while (true) {
// check if there are enough frames left in the audio buffer
if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) {
Expand All @@ -441,31 +454,48 @@ int wmain(int argc, wchar_t *argv[])
frames_size_bytes = frames * frame_size_bytes;
}
{
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);

// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(&gf->input_buffers[c],
audio[c].data() +
frames_count * frame_size_bytes,
frames_size_bytes);
{
auto max_wait = start_time_time +
(window_number * window_size_in_ms);
std::unique_lock<std::mutex> lock(gf->whisper_buf_mutex);
for (;;) {
// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
auto now = std::chrono::system_clock::now();
if (false && now > max_wait)
break;

gf->input_cv->wait_for(
lock, std::chrono::milliseconds(10), [&] {
return gf->input_buffers->size == 0;
});
if (gf->input_buffers->size == 0)
break;
}
// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(
&gf->input_buffers[c],
audio[c].data() +
frames_count * frame_size_bytes,
frames_size_bytes);
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp_offset_ns =
start_time + (int64_t)(((float)frames_count /
(float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp_offset_ns =
start_time +
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
gf->wshiper_thread_cv.notify_one();
}
frames_count += frames;
window_number += 1;
if (frames_count >= audio[0].size() / frame_size_bytes) {
break;
}
// sleep for 25 ms
std::this_thread::sleep_for(std::chrono::milliseconds(25));
}
// push a second of silence to the input circlebuf
frames = 2 * gf->sample_rate;
Expand Down
1 change: 1 addition & 0 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ struct transcription_filter_data {
std::mutex whisper_buf_mutex;
std::mutex whisper_ctx_mutex;
std::condition_variable wshiper_thread_cv;
std::optional<std::condition_variable> input_cv;

// translation context
struct translation_context translation_ctx;
Expand Down
15 changes: 10 additions & 5 deletions src/whisper-utils/whisper-processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
if (vad_state == VAD_STATE_PARTIAL) {
// peek instead of pop, since this is a partial run that keeps the data in the buffer
circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
} else {
circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
}

struct DetectionResultWithText inference_result =
Expand Down Expand Up @@ -599,11 +599,16 @@ void whisper_loop(void *data)
}
}

if (gf->input_cv.has_value())
gf->input_cv->notify_one();

// Sleep using the condition variable wshiper_thread_cv
// This will wake up the thread if there is new data in the input buffer
// or if the whisper context is null
std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
if (gf->input_buffers->size == 0) {
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
}
}

obs_log(gf->log_level, "Exiting whisper thread");
Expand Down

0 comments on commit 6cc88b1

Please sign in to comment.