From 5650d69c549e488799406736ac491512bf4699c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sascha=20N=C3=B6sberger?= Date: Thu, 19 Dec 2024 09:01:12 +0100 Subject: [PATCH] Add support for faster-whisper multilingual option The faster-whisper multilingual option allows language detection to be performed on each segment. With the multilingual option enabled, you can get better transcription results if the language was incorrectly detected in the first place, or if the spoken language is switched in the audio. --- src/whisper_ctranslate2/commandline.py | 7 +++++++ src/whisper_ctranslate2/transcribe.py | 2 ++ src/whisper_ctranslate2/whisper_ctranslate2.py | 1 + 3 files changed, 10 insertions(+) diff --git a/src/whisper_ctranslate2/commandline.py b/src/whisper_ctranslate2/commandline.py index be28b90..9b8dc12 100644 --- a/src/whisper_ctranslate2/commandline.py +++ b/src/whisper_ctranslate2/commandline.py @@ -364,6 +364,13 @@ def read_command_line(): help="When using Batched transcription the maximum number of parallel requests to model for decoding.", ) + algorithm_args.add_argument( + "--multilingual", + type=CommandLine._str2bool, + default=False, + help="Perform language detection on every segment", + ) + vad_args = parser.add_argument_group("VAD filter arguments") vad_args.add_argument( diff --git a/src/whisper_ctranslate2/transcribe.py b/src/whisper_ctranslate2/transcribe.py index b8a4c24..6392bb8 100644 --- a/src/whisper_ctranslate2/transcribe.py +++ b/src/whisper_ctranslate2/transcribe.py @@ -52,6 +52,7 @@ class TranscriptionOptions(NamedTuple): vad_min_speech_duration_ms: Optional[int] vad_max_speech_duration_s: Optional[int] vad_min_silence_duration_ms: Optional[int] + multilingual: bool class Transcribe: @@ -179,6 +180,7 @@ def inference( vad_filter=vad, vad_parameters=vad_parameters, **batch_size, + multilingual=options.multilingual, ) language_name = LANGUAGES[info.language].title() diff --git a/src/whisper_ctranslate2/whisper_ctranslate2.py b/src/whisper_ctranslate2/whisper_ctranslate2.py index c71b7d8..bd5bf70 100644 --- a/src/whisper_ctranslate2/whisper_ctranslate2.py +++ b/src/whisper_ctranslate2/whisper_ctranslate2.py @@ -74,6 +74,7 @@ def get_transcription_options(args): vad_min_speech_duration_ms=args.pop("vad_min_speech_duration_ms"), vad_max_speech_duration_s=args.pop("vad_max_speech_duration_s"), vad_min_silence_duration_ms=args.pop("vad_min_silence_duration_ms"), + multilingual=args.pop("multilingual"), )