Offline test improvements (#150)

* Look at the front of the whisper buffer instead of the back

  This should mostly not make a difference, but feels semantically more correct.

* Initialize `resampled_buffer` for offline tests

* Read relevant audio bytes

  There are two issues here:

  1. `linesize` may contain padding (didn't happen in my tests)
  2. From https://git.ffmpeg.org/gitweb/ffmpeg.git/blob/2b5f000d3f6f9e737e918a5438e6c881f65e70e2:/libavutil/frame.h#l405:

     > For audio, only linesize[0] may be set. For planar audio, each
     > channel plane must be the same size.

* Log running time in addition to local time

* Run whisper test "as fast as possible"

  This behaves roughly like libobs: each chunk of audio is inspected individually by VAD/whisper until processing by either takes longer than the window length, in which case audio continues to stream in.

* Only ever send a single chunk of audio

* Add additional files to tests copy command

* Use condition variable to signal input thread if available

* Only wait in whisper thread if input buffers are empty
locaal-ai · Aug 9, 2024 · 6cc88b1 · 6cc88b1
1 parent 09839bb
commit 6cc88b1
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 41 deletions.
diff --git a/src/tests/audio-file-utils.cpp b/src/tests/audio-file-utils.cpp
@@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function<void(int, int)> initializati
 					for (int j = 0; j < codecContext->channels; j++) {
 						buffer[j].insert(buffer[j].end(), frame->data[j],
 								 frame->data[j] +
-									 frame->linesize[0]);
+									 frame->nb_samples *
+										 sizeof(float));
 					}
 				}
 			}

diff --git a/src/tests/copy_dlls.ps1 b/src/tests/copy_dlls.ps1
@@ -20,21 +20,23 @@ $obsDlls = @(
     ".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll",
     ".\release\Release\obs-plugins\64bit\onnxruntime.dll",
     ".\release\Release\obs-plugins\64bit\whisper.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\zlib.dll"
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
+    ".\release\Release\obs-plugins\64bit\ggml.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\zlib.dll"
+    ".deps\obs-deps-2024-03-19-x64\bin\librist.dll"
+    ".deps\obs-deps-2024-03-19-x64\bin\srt.dll"
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
 )
 
 $obsDlls | ForEach-Object {
     Copy-Item -Force -Path $_ -Destination $testToolPath
 }
-
diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
@@ -31,6 +31,7 @@
 
 void obs_log(int log_level, const char *format, ...)
 {
+	static auto start = std::chrono::system_clock::now();
 	if (log_level == LOG_DEBUG) {
 		return;
 	}
@@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...)
 	std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
 	std::tm now_tm = *std::localtime(&now_time_t);
 
+	auto diff = now - start;
+
 	// print timestamp
-	printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec,
-	       (int)(epoch.count() % 1000));
+	printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
+	       now_tm.tm_sec, (int)(epoch.count() % 1000),
+	       std::chrono::duration_cast<std::chrono::minutes>(diff).count(),
+	       std::chrono::duration_cast<std::chrono::seconds>(diff).count() % 60,
+	       std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() % 1000);
 
 	// print log level
 	switch (log_level) {
@@ -95,12 +101,14 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->process_while_muted = false;
 	gf->buffered_output = false;
 	gf->fix_utf8 = true;
+	gf->input_cv.emplace();
 
 	for (size_t i = 0; i < gf->channels; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
 	}
 	circlebuf_init(&gf->info_buffer);
 	circlebuf_init(&gf->whisper_buffer);
+	circlebuf_init(&gf->resampled_buffer);
 
 	// allocate copy buffers
 	gf->copy_buffers[0] =
@@ -307,6 +315,7 @@ void release_context(transcription_filter_data *gf)
 	}
 	circlebuf_free(&gf->info_buffer);
 	circlebuf_free(&gf->whisper_buffer);
+	circlebuf_free(&gf->resampled_buffer);
 
 	delete gf;
 }
@@ -420,19 +429,23 @@ int wmain(int argc, wchar_t *argv[])
 		std::remove("segments.json");
 	}
 
+	const auto window_size_in_ms = std::chrono::milliseconds(25);
+
 	// fill up the whisper buffer
 	{
 		gf->start_timestamp_ms = now_ms();
 
 		obs_log(LOG_INFO, "Sending samples to whisper buffer");
 		// 25 ms worth of frames
-		int frames = gf->sample_rate * 25 / 1000;
+		int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
 		const int frame_size_bytes = sizeof(float);
 		int frames_size_bytes = frames * frame_size_bytes;
 		int frames_count = 0;
 		int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
 					     std::chrono::system_clock::now().time_since_epoch())
 					     .count();
+		auto start_time_time = std::chrono::system_clock::now();
+		uint64_t window_number = 0;
 		while (true) {
 			// check if there are enough frames left in the audio buffer
 			if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) {
@@ -441,31 +454,48 @@ int wmain(int argc, wchar_t *argv[])
 				frames_size_bytes = frames * frame_size_bytes;
 			}
 			{
-				std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
-
-				// push back current audio data to input circlebuf
-				for (size_t c = 0; c < gf->channels; c++) {
-					circlebuf_push_back(&gf->input_buffers[c],
-							    audio[c].data() +
-								    frames_count * frame_size_bytes,
-							    frames_size_bytes);
+				{
+					auto max_wait = start_time_time +
+							(window_number * window_size_in_ms);
+					std::unique_lock<std::mutex> lock(gf->whisper_buf_mutex);
+					for (;;) {
+						// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
+						auto now = std::chrono::system_clock::now();
+						if (false && now > max_wait)
+							break;
+
+						gf->input_cv->wait_for(
+							lock, std::chrono::milliseconds(10), [&] {
+								return gf->input_buffers->size == 0;
+							});
+						if (gf->input_buffers->size == 0)
+							break;
+					}
+					// push back current audio data to input circlebuf
+					for (size_t c = 0; c < gf->channels; c++) {
+						circlebuf_push_back(
+							&gf->input_buffers[c],
+							audio[c].data() +
+								frames_count * frame_size_bytes,
+							frames_size_bytes);
+					}
+					// push audio packet info (timestamp/frame count) to info circlebuf
+					struct transcription_filter_audio_info info = {0};
+					info.frames = frames; // number of frames in this packet
+					// make a timestamp from the current position in the audio buffer
+					info.timestamp_offset_ns =
+						start_time + (int64_t)(((float)frames_count /
+									(float)gf->sample_rate) *
+								       1e9);
+					circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 				}
-				// push audio packet info (timestamp/frame count) to info circlebuf
-				struct transcription_filter_audio_info info = {0};
-				info.frames = frames; // number of frames in this packet
-				// make a timestamp from the current position in the audio buffer
-				info.timestamp_offset_ns =
-					start_time +
-					(int64_t)(((float)frames_count / (float)gf->sample_rate) *
-						  1e9);
-				circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
+				gf->wshiper_thread_cv.notify_one();
 			}
 			frames_count += frames;
+			window_number += 1;
 			if (frames_count >= audio[0].size() / frame_size_bytes) {
 				break;
 			}
-			// sleep for 25 ms
-			std::this_thread::sleep_for(std::chrono::milliseconds(25));
 		}
 		// push a second of silence to the input circlebuf
 		frames = 2 * gf->sample_rate;

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -104,6 +104,7 @@ struct transcription_filter_data {
 	std::mutex whisper_buf_mutex;
 	std::mutex whisper_ctx_mutex;
 	std::condition_variable wshiper_thread_cv;
+	std::optional<std::condition_variable> input_cv;
 
 	// translation context
 	struct translation_context translation_ctx;

diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
@@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
 	if (vad_state == VAD_STATE_PARTIAL) {
 		// peek instead of pop, since this is a partial run that keeps the data in the buffer
-		circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
-				    pcm32f_size * sizeof(float));
+		circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				     pcm32f_size * sizeof(float));
 	} else {
-		circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
-				   pcm32f_size * sizeof(float));
+		circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				    pcm32f_size * sizeof(float));
 	}
 
 	struct DetectionResultWithText inference_result =
@@ -599,11 +599,16 @@ void whisper_loop(void *data)
 			}
 		}
 
+		if (gf->input_cv.has_value())
+			gf->input_cv->notify_one();
+
 		// Sleep using the condition variable wshiper_thread_cv
 		// This will wake up the thread if there is new data in the input buffer
 		// or if the whisper context is null
 		std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
-		gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
+		if (gf->input_buffers->size == 0) {
+			gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
+		}
 	}
 
 	obs_log(gf->log_level, "Exiting whisper thread");