diff --git a/.gitignore b/.gitignore index 540c1326..105682fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ whisperx.egg-info/ **/__pycache__/ .ipynb_checkpoints +build/ +dist/ +.vscode/ diff --git a/README.md b/README.md index 32e86665..f3c5abce 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,20 @@ This repository provides fast automatic speech recognition (70x realtime with la - 👯‍♂️ Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (speaker ID labels) - 🗣️ VAD preprocessing, reduces hallucination & batching with no WER degradation - +> #### WhisperX improvements +> +> This repository contains some improvements to WhisperX that were necessary for my research. Ideally, these would be merged into the original repository, but it appears unmaintained and does not accept pull requests. You can track the [open pull request here](https://github.com/m-bain/whisperX/pull/900). +> +> - Silero VAD added from #888 +> - Diarization improvements from #590 +> - Unique speakers added to result (inspiration from #126) +> - **Option to detect language per segment, very useful for longer audio with frequent language switches.** +> - Changed `setup.py` to `pyproject.toml` +> - Added VAD min duration on and off parameters to PyAnnote. The current implementation splits even on sub-second pauses, which is sometimes rather ineffective. +> - Pyannote.audio bumped to 3.3.2 +> +> **Multi-language audio handling** +> Improves greatly when the language is detected for every 30-second chunk. Run it like this: `whisperx audio.mp3 --detect_language_per_segment` **Whisper** is an ASR model [developed by OpenAI](https://github.com/openai/whisper), trained on a large dataset of diverse audio. Whilst it does produce highly accurate transcriptions, the corresponding timestamps are at the utterance-level, not per word, and can be inaccurate by several seconds. OpenAI's whisper does not natively support batching. @@ -82,15 +95,15 @@ See other methods [here.](https://pytorch.org/get-started/previous-versions/#v20 ### 3. Install this repo -`pip install git+https://github.com/m-bain/whisperx.git` +`pip install git+https://github.com/cvl01/whisperx.git` If already installed, update package to most recent commit -`pip install git+https://github.com/m-bain/whisperx.git --upgrade` +`pip install git+https://github.com/cvl01/whisperx.git --upgrade` If wishing to modify this package, clone and install in editable mode: ``` -$ git clone https://github.com/m-bain/whisperX.git +$ git clone https://github.com/cvl01/whisperX.git $ cd whisperX $ pip install -e . ``` @@ -259,7 +272,7 @@ Bug finding and pull requests are also highly appreciated to keep this project g * [ ] Add benchmarking code (TEDLIUM for spd/WER & word segmentation) -* [ ] Allow silero-vad as alternative VAD option +* [x] Allow silero-vad as alternative VAD option * [ ] Improve diarization (word level).
*Harder than first thought...* @@ -281,7 +294,9 @@ Borrows important alignment code from [PyTorch tutorial on forced alignment](htt And uses the wonderful pyannote VAD / Diarization https://github.com/pyannote/pyannote-audio -Valuable VAD & Diarization Models from [pyannote audio][https://github.com/pyannote/pyannote-audio] +Valuable VAD & Diarization Models from: +- [pyannote audio][https://github.com/pyannote/pyannote-audio] +- [silero vad][https://github.com/snakers4/silero-vad] Great backend from [faster-whisper](https://github.com/guillaumekln/faster-whisper) and [CTranslate2](https://github.com/OpenNMT/CTranslate2) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..1fd8530b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools>=65", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "whisperx" +version = "3.1.1" +description = "Time-Accurate Automatic Speech Recognition using Whisper." +readme = "README.md" +requires-python = ">=3.9" +authors = [ + {name = "Max Bain"} +] +license = {text = "MIT"} +dependencies = [ + "torch>=2", + "torchaudio>=2", + "faster-whisper==1.0.3", + "transformers", + "pandas", + "setuptools>=65", + "nltk", + "pyannote.audio==3.3.2" +] + +[project.optional-dependencies] +dev = ["pytest"] + +[project.scripts] +whisperx = "whisperx.transcribe:cli" + +[tool.setuptools.packages.find] +exclude = ["tests*"] + +[tool.setuptools] +include-package-data = true diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 865abd1f..00000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch>=2 -torchaudio>=2 -faster-whisper==1.0.0 -transformers -pandas -setuptools>=65 -nltk diff --git a/setup.py b/setup.py deleted file mode 100644 index 40db6cc9..00000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import platform - -import pkg_resources -from setuptools import find_packages, setup - -setup( - name="whisperx", - py_modules=["whisperx"], - version="3.1.1", - description="Time-Accurate Automatic Speech Recognition using Whisper.", - readme="README.md", - python_requires=">=3.8", - author="Max Bain", - url="https://github.com/m-bain/whisperx", - license="MIT", - packages=find_packages(exclude=["tests*"]), - install_requires=[ - str(r) - for r in pkg_resources.parse_requirements( - open(os.path.join(os.path.dirname(__file__), "requirements.txt")) - ) - ] - + [f"pyannote.audio==3.1.1"], - entry_points={ - "console_scripts": ["whisperx=whisperx.transcribe:cli"], - }, - include_package_data=True, - extras_require={"dev": ["pytest"]}, -) diff --git a/whisperx/__init__.py b/whisperx/__init__.py index 20abaaed..92e6d424 100644 --- a/whisperx/__init__.py +++ b/whisperx/__init__.py @@ -1,4 +1,4 @@ -from .transcribe import load_model from .alignment import load_align_model, align from .audio import load_audio -from .diarize import assign_word_speakers, DiarizationPipeline \ No newline at end of file +from .diarize import assign_word_speakers, DiarizationPipeline +from .asr import load_model diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 964217e2..bd813683 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -184,11 +184,13 @@ def align( t1 = segment["start"] t2 = segment["end"] text = segment["text"] + language = segment["language"] aligned_seg: SingleAlignedSegment = { "start": t1, "end": t2, "text": text, + "language": language, "words": [], } @@ -324,6 +326,7 @@ def 
align( "start": sentence_start, "end": sentence_end, "words": sentence_words, + "language": language }) if return_char_alignments: @@ -337,7 +340,7 @@ def align( aligned_subsegments["start"] = interpolate_nans(aligned_subsegments["start"], method=interpolate_method) aligned_subsegments["end"] = interpolate_nans(aligned_subsegments["end"], method=interpolate_method) # concatenate sentences with same timestamps - agg_dict = {"text": " ".join, "words": "sum"} + agg_dict = {"text": " ".join, "words": "sum", "language": "first"} if model_lang in LANGUAGES_WITHOUT_SPACES: agg_dict["text"] = "".join if return_char_alignments: diff --git a/whisperx/asr.py b/whisperx/asr.py index 0ccaf92b..fb92ad81 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -1,6 +1,6 @@ import os -import warnings -from typing import List, Union, Optional, NamedTuple +import re +from typing import List, NamedTuple, Optional, Union import ctranslate2 import faster_whisper @@ -9,9 +9,12 @@ from transformers import Pipeline from transformers.pipelines.pt_utils import PipelineIterator +from whisperx.utils import LANGUAGES +import whisperx.vads + from .audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram -from .vad import load_vad_model, merge_chunks -from .types import TranscriptionResult, SingleSegment +from .types import SingleSegment, TranscriptionResult + def find_numeral_symbol_tokens(tokenizer): numeral_symbol_tokens = [] @@ -22,13 +25,23 @@ def find_numeral_symbol_tokens(tokenizer): numeral_symbol_tokens.append(i) return numeral_symbol_tokens + class WhisperModel(faster_whisper.WhisperModel): - ''' + """ FasterWhisperModel provides batched inference for faster-whisper. Currently only works in non-timestamp mode and fixed prompt for all samples in batch. - ''' + """ + + def generate_segment_batched( + self, + features: np.ndarray, + tokenizer: faster_whisper.tokenizer.Tokenizer, + options: faster_whisper.transcribe.TranscriptionOptions, + encoder_output=None, + detect_language_per_segment=False, + possible_languages: list[str] = ["<|en|>"], + ): - def generate_segment_batched(self, features: np.ndarray, tokenizer: faster_whisper.tokenizer.Tokenizer, options: faster_whisper.transcribe.TranscriptionOptions, encoder_output = None): batch_size = features.shape[0] all_tokens = [] prompt_reset_since = 0 @@ -37,33 +50,68 @@ def generate_segment_batched(self, features: np.ndarray, tokenizer: faster_whisp initial_prompt_tokens = tokenizer.encode(initial_prompt) all_tokens.extend(initial_prompt_tokens) previous_tokens = all_tokens[prompt_reset_since:] - prompt = self.get_prompt( - tokenizer, - previous_tokens, - without_timestamps=options.without_timestamps, - prefix=options.prefix, - ) encoder_output = self.encode(features) + def extract_language_code(text): + return re.search(r"<\|(.*?)\|>", text).group(1) + + prompts = None + if detect_language_per_segment: + + detected_languages = self.model.detect_language(encoder_output) + + detected_languages = [ + [x for x in y if x[0] in possible_languages] for y in detected_languages + ] + + detected_languages = [inner_list[0] for inner_list in detected_languages] + + lang_ids, lang_scores = zip(*detected_languages) + lang_ids = [extract_language_code(x) for x in lang_ids] + + prompts_per_lang = {} + + for lang_id in list(set(lang_ids)): + tokenizer.language = tokenizer.tokenizer.token_to_id("<|%s|>" % lang_id) + tokenizer.language_code = lang_id + prompts_per_lang[lang_id] = self.get_prompt( + tokenizer, + previous_tokens, + 
without_timestamps=options.without_timestamps, + prefix=options.prefix, + ) + + prompts = [prompts_per_lang[lang_id] for lang_id in lang_ids] + + else: + prompt = self.get_prompt( + tokenizer, + previous_tokens, + without_timestamps=options.without_timestamps, + prefix=options.prefix, + ) + lang_ids = [tokenizer.language_code] * batch_size + lang_scores = None + max_initial_timestamp_index = int( round(options.max_initial_timestamp / self.time_precision) ) result = self.model.generate( - encoder_output, - [prompt] * batch_size, - beam_size=options.beam_size, - patience=options.patience, - length_penalty=options.length_penalty, - max_length=self.max_length, - suppress_blank=options.suppress_blank, - suppress_tokens=options.suppress_tokens, - ) + encoder_output, + prompts or [prompt] * batch_size, + beam_size=options.beam_size, + patience=options.patience, + length_penalty=options.length_penalty, + max_length=self.max_length, + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, + ) tokens_batch = [x.sequences_ids[0] for x in result] - def decode_batch(tokens: List[List[int]]) -> str: + def decode_batch(tokens: List[List[int]]) -> List[str]: res = [] for tk in tokens: res.append([token for token in tk if token < tokenizer.eot]) @@ -71,8 +119,7 @@ def decode_batch(tokens: List[List[int]]) -> str: return tokenizer.tokenizer.decode_batch(res) text = decode_batch(tokens_batch) - - return text + return text, lang_ids, lang_scores def encode(self, features: np.ndarray) -> ctranslate2.StorageView: # When the model is running on multiple GPUs, the encoder output should be moved @@ -85,26 +132,28 @@ def encode(self, features: np.ndarray) -> ctranslate2.StorageView: return self.model.encode(features, to_cpu=to_cpu) + class FasterWhisperPipeline(Pipeline): """ Huggingface Pipeline wrapper for FasterWhisperModel. 
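The per-segment language selection in `generate_segment_batched` reduces to a small filtering step: take the detector's ranked candidates for each 30-second chunk, drop anything outside the caller's allow-list, and keep the top hit. A minimal, dependency-free sketch of that step (the probabilities below are fabricated; in the real code they come from `detect_language` on the encoder output):

```python
import re

def pick_segment_languages(detected, possible_languages):
    """Keep only allow-listed language tokens per chunk, then take the
    top-scoring candidate -- mirrors the filtering in generate_segment_batched."""
    picked = []
    for candidates in detected:        # one ranked (token, prob) list per batch item
        allowed = [c for c in candidates if c[0] in possible_languages]
        token, score = allowed[0]      # candidate lists arrive sorted by probability
        picked.append((re.search(r"<\|(.*?)\|>", token).group(1), score))
    return picked

# Fabricated detector output for two 30-second chunks:
detected = [
    [("<|nl|>", 0.71), ("<|en|>", 0.20), ("<|de|>", 0.05)],
    [("<|en|>", 0.64), ("<|nl|>", 0.30), ("<|fr|>", 0.02)],
]
print(pick_segment_languages(detected, ["<|nl|>", "<|en|>"]))
# [('nl', 0.71), ('en', 0.64)]
```

Grouping the resulting language ids into one prompt per language rather than one per segment keeps the number of `get_prompt` calls small, which is what the `prompts_per_lang` dictionary above is for.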
""" + # TODO: # - add support for timestamp mode # - add support for custom inference kwargs def __init__( - self, - model, - vad, - vad_params: dict, - options : NamedTuple, - tokenizer=None, - device: Union[int, str, "torch.device"] = -1, - framework = "pt", - language : Optional[str] = None, - suppress_numerals: bool = False, - **kwargs + self, + model: WhisperModel, + vad, + vad_params: dict, + options: NamedTuple, + tokenizer=None, + device: Union[int, str, "torch.device"] = -1, + framework="pt", + language: Optional[str] = None, + suppress_numerals: bool = False, + **kwargs, ): self.model = model self.tokenizer = tokenizer @@ -113,7 +162,9 @@ def __init__( self.suppress_numerals = suppress_numerals self._batch_size = kwargs.pop("batch_size", None) self._num_workers = 1 - self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs) + self._preprocess_params, self._forward_params, self._postprocess_params = ( + self._sanitize_parameters(**kwargs) + ) self.call_count = 0 self.framework = framework if self.framework == "pt": @@ -139,24 +190,40 @@ def _sanitize_parameters(self, **kwargs): return preprocess_kwargs, {}, {} def preprocess(self, audio): - audio = audio['inputs'] + audio = audio["inputs"] model_n_mels = self.model.feat_kwargs.get("feature_size") features = log_mel_spectrogram( audio, n_mels=model_n_mels if model_n_mels is not None else 80, padding=N_SAMPLES - audio.shape[0], ) - return {'inputs': features} + return {"inputs": features} def _forward(self, model_inputs): - outputs = self.model.generate_segment_batched(model_inputs['inputs'], self.tokenizer, self.options) - return {'text': outputs} + outputs, lang_ids, lang_scores = self.model.generate_segment_batched( + model_inputs["inputs"], + self.tokenizer, + self.options, + detect_language_per_segment=self.detect_language_per_segment, + possible_languages=self.possible_languages, + ) + return { + "text": outputs, + "language": lang_ids, + "language_confidence": lang_scores, + } def postprocess(self, model_outputs): return model_outputs def get_iterator( - self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params + self, + inputs, + num_workers: int, + batch_size: int, + preprocess_params, + forward_params, + postprocess_params, ): dataset = PipelineIterator(inputs, self.preprocess, preprocess_params) if "TOKENIZERS_PARALLELISM" not in os.environ: @@ -164,46 +231,98 @@ def get_iterator( # TODO hack by collating feature_extractor and image_processor def stack(items): - return {'inputs': torch.stack([x['inputs'] for x in items])} - dataloader = torch.utils.data.DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=stack) - model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size) - final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params) + return {"inputs": torch.stack([x["inputs"] for x in items])} + + dataloader = torch.utils.data.DataLoader( + dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=stack + ) + model_iterator = PipelineIterator( + dataloader, self.forward, forward_params, loader_batch_size=batch_size + ) + final_iterator = PipelineIterator( + model_iterator, self.postprocess, postprocess_params + ) return final_iterator def transcribe( - self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, chunk_size=30, print_progress = False, combined_progress=False + self, + 
audio: Union[str, np.ndarray], + batch_size=None, + num_workers=0, + language=None, + task=None, + chunk_size=30, + print_progress=False, + combined_progress=False, + detect_language_per_segment=False, + possible_languages: List[str] = [], ) -> TranscriptionResult: if isinstance(audio, str): audio = load_audio(audio) def data(audio, segments): for seg in segments: - f1 = int(seg['start'] * SAMPLE_RATE) - f2 = int(seg['end'] * SAMPLE_RATE) + f1 = int(seg["start"] * SAMPLE_RATE) + f2 = int(seg["end"] * SAMPLE_RATE) # print(f2-f1) - yield {'inputs': audio[f1:f2]} + yield {"inputs": audio[f1:f2]} - vad_segments = self.vad_model({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": SAMPLE_RATE}) + # Pre-process audio and merge chunks as defined by the respective VAD child class + # In case vad_model is manually assigned (see 'load_model') follow the functionality of pyannote toolkit + if issubclass(type(self.vad_model), whisperx.vads.Vad): + waveform = self.vad_model.preprocess_audio(audio) + merge_chunks = self.vad_model.merge_chunks + else: + waveform = whisperx.vads.Pyannote.preprocess_audio(audio) + merge_chunks = whisperx.vads.Pyannote.merge_chunks + + vad_segments = self.vad_model( + {"waveform": waveform, "sample_rate": SAMPLE_RATE} + ) vad_segments = merge_chunks( vad_segments, chunk_size, onset=self._vad_params["vad_onset"], offset=self._vad_params["vad_offset"], + min_duration_on=self._vad_params["vad_min_duration_on"], + min_duration_off=self._vad_params["vad_min_duration_off"], ) + + if not possible_languages: + possible_languages = LANGUAGES.keys() + + self.possible_languages = [f"<|{x}|>" for x in possible_languages] + self.detect_language_per_segment = detect_language_per_segment + if self.detect_language_per_segment and not language: + language = "en" + if self.tokenizer is None: - language = language or self.detect_language(audio) + if vad_segments: + first_segment = vad_segments[0] + start_sample = int(first_segment["start"] * SAMPLE_RATE) + segment_audio = audio[start_sample:] + else: + # Fall back to the first 30 seconds of audio if no VAD segments found + segment_audio = audio + language = language or self.detect_language(segment_audio) task = task or "transcribe" - self.tokenizer = faster_whisper.tokenizer.Tokenizer(self.model.hf_tokenizer, - self.model.model.is_multilingual, task=task, - language=language) + self.tokenizer = faster_whisper.tokenizer.Tokenizer( + self.model.hf_tokenizer, + self.model.model.is_multilingual, + task=task, + language=language, + ) else: language = language or self.tokenizer.language_code task = task or self.tokenizer.task if task != self.tokenizer.task or language != self.tokenizer.language_code: - self.tokenizer = faster_whisper.tokenizer.Tokenizer(self.model.hf_tokenizer, - self.model.model.is_multilingual, task=task, - language=language) - + self.tokenizer = faster_whisper.tokenizer.Tokenizer( + self.model.hf_tokenizer, + self.model.model.is_multilingual, + task=task, + language=language, + ) + if self.suppress_numerals: previous_suppress_tokens = self.options.suppress_tokens numeral_symbol_tokens = find_numeral_symbol_tokens(self.tokenizer) @@ -215,19 +334,30 @@ def data(audio, segments): segments: List[SingleSegment] = [] batch_size = batch_size or self._batch_size total_segments = len(vad_segments) - for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)): + + for idx, out in enumerate( + self.__call__( + data(audio, vad_segments), + batch_size=batch_size, + 
num_workers=num_workers, + ) + ): if print_progress: base_progress = ((idx + 1) / total_segments) * 100 - percent_complete = base_progress / 2 if combined_progress else base_progress + percent_complete = ( + base_progress / 2 if combined_progress else base_progress + ) print(f"Progress: {percent_complete:.2f}%...") - text = out['text'] + text = out["text"] if batch_size in [0, 1, None]: text = text[0] segments.append( { "text": text, - "start": round(vad_segments[idx]['start'], 3), - "end": round(vad_segments[idx]['end'], 3) + "start": round(vad_segments[idx]["start"], 3), + "end": round(vad_segments[idx]["end"], 3), + "language": out["language"], + "language_confidence": out["language_confidence"], } ) @@ -237,42 +367,54 @@ def data(audio, segments): # revert suppressed tokens if suppress_numerals is enabled if self.suppress_numerals: - self.options = self.options._replace(suppress_tokens=previous_suppress_tokens) + self.options = self.options._replace( + suppress_tokens=previous_suppress_tokens + ) return {"segments": segments, "language": language} - def detect_language(self, audio: np.ndarray): if audio.shape[0] < N_SAMPLES: - print("Warning: audio is shorter than 30s, language detection may be inaccurate.") + print( + "Warning: audio is shorter than 30s, language detection may be inaccurate." + ) model_n_mels = self.model.feat_kwargs.get("feature_size") - segment = log_mel_spectrogram(audio[: N_SAMPLES], - n_mels=model_n_mels if model_n_mels is not None else 80, - padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0]) + segment = log_mel_spectrogram( + audio[:N_SAMPLES], + n_mels=model_n_mels if model_n_mels is not None else 80, + padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0], + ) encoder_output = self.model.encode(segment) results = self.model.model.detect_language(encoder_output) language_token, language_probability = results[0][0] language = language_token[2:-2] - print(f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio...") + print( + f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio..." + ) return language -def load_model(whisper_arch, - device, - device_index=0, - compute_type="float16", - asr_options=None, - language : Optional[str] = None, - vad_model=None, - vad_options=None, - model : Optional[WhisperModel] = None, - task="transcribe", - download_root=None, - threads=4): - '''Load a Whisper model for inference. + +def load_model( + whisper_arch, + device, + device_index=0, + compute_type="float16", + asr_options=None, + language: Optional[str] = None, + vad_model=None, + vad_method=None, + vad_options=None, + model: Optional[WhisperModel] = None, + task="transcribe", + download_root=None, + threads=4, +) -> FasterWhisperPipeline: + """Load a Whisper model for inference. Args: whisper_arch: str - The name of the Whisper model to load. device: str - The device to load the model on. compute_type: str - The compute type to use for the model. + vad_method: str - The vad method to use. vad_model has higher priority if is not None. options: dict - A dictionary of options to use for the model. language: str - The language of the model. (use English for now) model: Optional[WhisperModel] - The WhisperModel instance to use. @@ -280,24 +422,33 @@ def load_model(whisper_arch, threads: int - The number of cpu threads to use per worker, e.g. will be multiplied by num workers. Returns: A Whisper pipeline. 
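For context on how the new `vad_method` and extended `vad_options` are consumed, here is a hedged sketch of calling the updated `load_model` with the Silero backend; the model name and threshold values are illustrative rather than recommendations, and `pyannote` remains the default method:

```python
import whisperx

# Illustrative call; assumes this fork is installed and model weights can be downloaded.
model = whisperx.load_model(
    "large-v3",
    device="cuda",
    compute_type="float16",
    vad_method="silero",             # new switch; "pyannote" is the default
    vad_options={
        "chunk_size": 30,            # silero binarizes before merge_chunks, so it needs this
        "vad_onset": 0.5,            # speech-probability threshold (both backends)
        "vad_offset": 0.363,         # pyannote only
        "vad_min_duration_on": 0.0,  # pyannote only: drop very short speech regions
        "vad_min_duration_off": 0.1, # pyannote only: bridge sub-second pauses
    },
)
```

The CLI exposes the same switch through `--vad_method {pyannote,silero}` alongside the existing `--vad_onset`, `--vad_offset` and `--chunk_size` flags.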
- ''' + """ if whisper_arch.endswith(".en"): language = "en" - model = model or WhisperModel(whisper_arch, - device=device, - device_index=device_index, - compute_type=compute_type, - download_root=download_root, - cpu_threads=threads) + model = model or WhisperModel( + whisper_arch, + device=device, + device_index=device_index, + compute_type=compute_type, + download_root=download_root, + cpu_threads=threads, + ) if language is not None: - tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language) + tokenizer = faster_whisper.tokenizer.Tokenizer( + model.hf_tokenizer, + model.model.is_multilingual, + task=task, + language=language, + ) else: - print("No language specified, language will be first be detected for each audio file (increases inference time).") + print( + "No language specified, language will be first be detected for each audio file (increases inference time)." + ) tokenizer = None - default_asr_options = { + default_asr_options = { "beam_size": 5, "best_of": 5, "patience": 1, @@ -323,6 +474,7 @@ def load_model(whisper_arch, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None, + "hotwords": None, } if asr_options is not None: @@ -331,20 +483,33 @@ def load_model(whisper_arch, suppress_numerals = default_asr_options["suppress_numerals"] del default_asr_options["suppress_numerals"] - default_asr_options = faster_whisper.transcribe.TranscriptionOptions(**default_asr_options) + default_asr_options = faster_whisper.transcribe.TranscriptionOptions( + **default_asr_options + ) default_vad_options = { + "chunk_size": 30, # needed by silero since binarization happens before merge_chunks "vad_onset": 0.500, - "vad_offset": 0.363 + "vad_offset": 0.363, + "vad_min_duration_on": 0.0, + "vad_min_duration_off": 0.0, } if vad_options is not None: default_vad_options.update(vad_options) + # Note: manually assigned vad_model has higher priority than vad_method! if vad_model is not None: + print("Use manually assigned vad_model. vad_method is ignored.") vad_model = vad_model else: - vad_model = load_vad_model(torch.device(device), use_auth_token=None, **default_vad_options) + match vad_method: + case "silero": + vad_model = whisperx.vads.Silero(**default_vad_options) + case "pyannote" | _: + vad_model = whisperx.vads.Pyannote( + torch.device(device), use_auth_token=None, **default_vad_options + ) return FasterWhisperPipeline( model=model, diff --git a/whisperx/diarize.py b/whisperx/diarize.py index c327c932..88fdfaa1 100644 --- a/whisperx/diarize.py +++ b/whisperx/diarize.py @@ -33,20 +33,26 @@ def __call__(self, audio: Union[str, np.ndarray], num_speakers=None, min_speaker def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False): + unique_speakers = set() transcript_segments = transcript_result["segments"] for seg in transcript_segments: # assign speaker to segment (if any) diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'], seg['start']) diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start']) # remove no hit, otherwise we look for closest (even negative intersection...) 
- if not fill_nearest: - dia_tmp = diarize_df[diarize_df['intersection'] > 0] - else: - dia_tmp = diarize_df - if len(dia_tmp) > 0: - # sum over speakers - speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0] + intersected = diarize_df[diarize_df["intersection"] > 0] + + speaker = None + if len(intersected) > 0: + # Choosing most strong intersection + speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0] + elif fill_nearest: + # Otherwise choosing closest + speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0] + + if speaker is not None: seg["speaker"] = speaker + unique_speakers.add(speaker) # assign speaker to words if 'words' in seg: @@ -54,16 +60,21 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False): if 'start' in word: diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(diarize_df['start'], word['start']) diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'], word['start']) - # remove no hit - if not fill_nearest: - dia_tmp = diarize_df[diarize_df['intersection'] > 0] - else: - dia_tmp = diarize_df - if len(dia_tmp) > 0: - # sum over speakers - speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0] - word["speaker"] = speaker - + intersected = diarize_df[diarize_df["intersection"] > 0] + + word_speaker = None + if len(intersected) > 0: + # Choosing most strong intersection + word_speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0] + elif fill_nearest: + # Otherwise choosing closest + word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0] + + if word_speaker is not None: + word["speaker"] = word_speaker + + transcript_result["speakers"] = list(unique_speakers) + return transcript_result diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index edd27648..bef3c1a1 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -39,6 +39,7 @@ def cli(): parser.add_argument("--return_char_alignments", action='store_true', help="Return character-level alignments in the output json file") # vad params + parser.add_argument("--vad_method", type=str, default="pyannote", choices=["pyannote", "silero"], help="VAD method to be used") parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected") parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.") parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.") @@ -56,6 +57,7 @@ def cli(): parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations") parser.add_argument("--suppress_numerals", action="store_true", help="whether to suppress numeric symbols and currency symbols during sampling, since wav2vec2 cannot align them correctly") + parser.add_argument("--detect_language_per_segment", action="store_true", help="Use this for multi language audio, if you want language detection to happen for every segment. 
") parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.") parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop") @@ -102,6 +104,7 @@ def cli(): return_char_alignments: bool = args.pop("return_char_alignments") hf_token: str = args.pop("hf_token") + vad_method: str = args.pop("vad_method") vad_onset: float = args.pop("vad_onset") vad_offset: float = args.pop("vad_offset") @@ -167,13 +170,14 @@ def cli(): results = [] tmp_results = [] # model = load_model(model_name, device=device, download_root=model_dir) - model = load_model(model_name, device=device, device_index=device_index, download_root=model_dir, compute_type=compute_type, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset}, task=task, threads=faster_whisper_threads) + model = load_model(model_name, device=device, device_index=device_index, download_root=model_dir, compute_type=compute_type, language=args['language'], asr_options=asr_options, vad_method=vad_method, vad_options={"chunk_size":chunk_size, "vad_onset": vad_onset, "vad_offset": vad_offset}, task=task, threads=faster_whisper_threads) for audio_path in args.pop("audio"): audio = load_audio(audio_path) # >> VAD & ASR print(">>Performing transcription...") - result = model.transcribe(audio, batch_size=batch_size, chunk_size=chunk_size, print_progress=print_progress) + detect_language_per_segment = args.pop("detect_language_per_segment", False) + result = model.transcribe(audio, batch_size=batch_size, chunk_size=chunk_size, print_progress=print_progress, detect_language_per_segment=detect_language_per_segment) results.append((result, audio_path)) # Unload Whisper and VAD diff --git a/whisperx/types.py b/whisperx/types.py index 68f2d783..46f67439 100644 --- a/whisperx/types.py +++ b/whisperx/types.py @@ -5,15 +5,18 @@ class SingleWordSegment(TypedDict): """ A single word of a speech. """ + word: str start: float end: float score: float + class SingleCharSegment(TypedDict): """ A single char of a speech. """ + char: str start: float end: float @@ -30,7 +33,7 @@ class SingleSegment(TypedDict): text: str -class SingleAlignedSegment(TypedDict): +class SingleAlignedSegment(TypedDict, total=False): """ A single segment (up to multiple sentences) of a speech with word alignment. """ @@ -38,21 +41,27 @@ class SingleAlignedSegment(TypedDict): start: float end: float text: str + language: Optional[str] words: List[SingleWordSegment] chars: Optional[List[SingleCharSegment]] + speaker: Optional[str] class TranscriptionResult(TypedDict): """ A list of segments and word segments of a speech. """ + segments: List[SingleSegment] language: str + speakers: List[str] class AlignedTranscriptionResult(TypedDict): """ A list of segments and word segments of a speech. 
""" + segments: List[SingleAlignedSegment] word_segments: List[SingleWordSegment] + speakers: List[str] diff --git a/whisperx/vads/__init__.py b/whisperx/vads/__init__.py new file mode 100644 index 00000000..9dd82bf7 --- /dev/null +++ b/whisperx/vads/__init__.py @@ -0,0 +1,3 @@ +from whisperx.vads.pyannote import Pyannote +from whisperx.vads.silero import Silero +from whisperx.vads.vad import Vad \ No newline at end of file diff --git a/whisperx/vad.py b/whisperx/vads/pyannote.py similarity index 54% rename from whisperx/vad.py rename to whisperx/vads/pyannote.py index ab2c7bbf..69b2b601 100644 --- a/whisperx/vad.py +++ b/whisperx/vads/pyannote.py @@ -1,66 +1,29 @@ import hashlib import os import urllib -from typing import Callable, Optional, Text, Union +from typing import Callable, Text, Union +from typing import Optional +import warnings import numpy as np -import pandas as pd import torch from pyannote.audio import Model from pyannote.audio.core.io import AudioFile from pyannote.audio.pipelines import VoiceActivityDetection from pyannote.audio.pipelines.utils import PipelineModel -from pyannote.core import Annotation, Segment, SlidingWindowFeature +from pyannote.core import Annotation, SlidingWindowFeature +from pyannote.core import Segment from tqdm import tqdm -from .diarize import Segment as SegmentX +from whisperx.diarize import Segment as SegmentX +from whisperx.vads.vad import Vad VAD_SEGMENTATION_URL = "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin" -def load_vad_model(device, vad_onset=0.500, vad_offset=0.363, use_auth_token=None, model_fp=None): - model_dir = torch.hub._get_torch_home() - os.makedirs(model_dir, exist_ok = True) - if model_fp is None: - model_fp = os.path.join(model_dir, "whisperx-vad-segmentation.bin") - if os.path.exists(model_fp) and not os.path.isfile(model_fp): - raise RuntimeError(f"{model_fp} exists and is not a regular file") - - if not os.path.isfile(model_fp): - with urllib.request.urlopen(VAD_SEGMENTATION_URL) as source, open(model_fp, "wb") as output: - with tqdm( - total=int(source.info().get("Content-Length")), - ncols=80, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as loop: - while True: - buffer = source.read(8192) - if not buffer: - break - - output.write(buffer) - loop.update(len(buffer)) - - model_bytes = open(model_fp, "rb").read() - if hashlib.sha256(model_bytes).hexdigest() != VAD_SEGMENTATION_URL.split('/')[-2]: - raise RuntimeError( - "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model." - ) - - vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token) - hyperparameters = {"onset": vad_onset, - "offset": vad_offset, - "min_duration_on": 0.1, - "min_duration_off": 0.1} - vad_pipeline = VoiceActivitySegmentation(segmentation=vad_model, device=torch.device(device)) - vad_pipeline.instantiate(hyperparameters) - - return vad_pipeline class Binarize: """Binarize detection scores using hysteresis thresholding, with min-cut operation - to ensure not segments are longer than max_duration. + to ensure no segments are longer than max_duration. Parameters ---------- @@ -80,14 +43,15 @@ class Binarize: Defaults to 0s. max_duration: float The maximum length of an active segment, divides segment at timestamp with lowest score. + Reference --------- Gregory Gelly and Jean-Luc Gauvain. 
"Minimum Word Error Training of RNN-based Voice Activity Detection", InterSpeech 2015. - Modified by Max Bain to include WhisperX's min-cut operation + Modified by Max Bain to include WhisperX's min-cut operation https://arxiv.org/abs/2303.00747 - + Pyannote-audio """ @@ -99,7 +63,7 @@ def __init__( min_duration_off: float = 0.0, pad_onset: float = 0.0, pad_offset: float = 0.0, - max_duration: float = float('inf') + max_duration: float = float("inf"), ): super().__init__() @@ -145,18 +109,22 @@ def __call__(self, scores: SlidingWindowFeature) -> Annotation: t = start for t, y in zip(timestamps[1:], k_scores[1:]): # currently active - if is_active: + if is_active: curr_duration = t - start if curr_duration > self.max_duration: search_after = len(curr_scores) // 2 # divide segment - min_score_div_idx = search_after + np.argmin(curr_scores[search_after:]) + min_score_div_idx = search_after + np.argmin( + curr_scores[search_after:] + ) min_score_t = curr_timestamps[min_score_div_idx] - region = Segment(start - self.pad_onset, min_score_t + self.pad_offset) + region = Segment( + start - self.pad_onset, min_score_t + self.pad_offset + ) active[region, k] = label start = curr_timestamps[min_score_div_idx] - curr_scores = curr_scores[min_score_div_idx+1:] - curr_timestamps = curr_timestamps[min_score_div_idx+1:] + curr_scores = curr_scores[min_score_div_idx + 1 :] + curr_timestamps = curr_timestamps[min_score_div_idx + 1 :] # switching from active to inactive elif y < self.offset: region = Segment(start - self.pad_onset, t + self.pad_offset) @@ -181,11 +149,40 @@ def __call__(self, scores: SlidingWindowFeature) -> Annotation: # because of padding, some active regions might be overlapping: merge them. # also: fill same speaker gaps shorter than min_duration_off - if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0: - if self.max_duration < float("inf"): - raise NotImplementedError(f"This would break current max_duration param") + if self.min_duration_off > 0.0: active = active.support(collar=self.min_duration_off) + # After applying min_duration_off, some segments might be longer than max_duration + # Reprocess those segments with min_duration_off set to 0 + if self.min_duration_off > 0.0 and self.max_duration < float("inf"): + new_min_duration_off = ( + self.min_duration_off / 2 + if self.min_duration_off > 1.0 + else max(0.0, self.min_duration_off - 0.5) + ) + binarizer_no_min_off = Binarize( + onset=self.onset, + offset=self.offset, + min_duration_on=self.min_duration_on, + min_duration_off=new_min_duration_off, + pad_onset=self.pad_onset, + pad_offset=self.pad_offset, + max_duration=self.max_duration, + ) + fixed_active = Annotation() + for segment, track in active.itertracks(): + if segment.duration <= self.max_duration: + fixed_active[segment, track] = active[segment, track] + else: + segment_scores = scores.crop( + segment, mode="strict", return_data=False + ) + segment_active = binarizer_no_min_off(segment_scores) + for seg, trk in segment_active.itertracks(): + fixed_active[seg, trk] = segment_active[seg, trk] + + active = fixed_active + # remove tracks shorter than min_duration_on if self.min_duration_on > 0: for segment, track in list(active.itertracks()): @@ -204,7 +201,12 @@ def __init__( **inference_kwargs, ): - super().__init__(segmentation=segmentation, fscore=fscore, use_auth_token=use_auth_token, **inference_kwargs) + super().__init__( + segmentation=segmentation, + fscore=fscore, + use_auth_token=use_auth_token, + **inference_kwargs, + ) def 
apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation: """Apply voice activity detection @@ -240,72 +242,92 @@ def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation: return segmentations -def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0): +class Pyannote(Vad): + + def __init__(self, device, use_auth_token=None, model_fp=None, **kwargs): + print(">>Performing voice activity detection using Pyannote...") + super().__init__(kwargs["vad_onset"]) + + model_dir = torch.hub._get_torch_home() + os.makedirs(model_dir, exist_ok=True) + if model_fp is None: + model_fp = os.path.join(model_dir, "whisperx-vad-segmentation.bin") + if os.path.exists(model_fp) and not os.path.isfile(model_fp): + raise RuntimeError(f"{model_fp} exists and is not a regular file") + + if not os.path.isfile(model_fp): + with urllib.request.urlopen(VAD_SEGMENTATION_URL) as source, open( + model_fp, "wb" + ) as output: + with tqdm( + total=int(source.info().get("Content-Length")), + ncols=80, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + model_bytes = open(model_fp, "rb").read() + if ( + hashlib.sha256(model_bytes).hexdigest() + != VAD_SEGMENTATION_URL.split("/")[-2] + ): + warnings.warn( + "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model." + ) + + vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token) + hyperparameters = { + "onset": kwargs["vad_onset"], + "offset": kwargs["vad_offset"], + "min_duration_on": kwargs["vad_min_duration_on"], + "min_duration_off": kwargs["vad_min_duration_off"], + } + self.vad_pipeline = VoiceActivitySegmentation( + segmentation=vad_model, device=torch.device(device) + ) + self.vad_pipeline.instantiate(hyperparameters) - active = Annotation() - for k, vad_t in enumerate(vad_arr): - region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset) - active[region, k] = 1 + def __call__(self, audio: AudioFile, **kwargs): + return self.vad_pipeline(audio) + @staticmethod + def preprocess_audio(audio): + return torch.from_numpy(audio).unsqueeze(0) - if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0: - active = active.support(collar=min_duration_off) - - # remove tracks shorter than min_duration_on - if min_duration_on > 0: - for segment, track in list(active.itertracks()): - if segment.duration < min_duration_on: - del active[segment, track] - - active = active.for_json() - active_segs = pd.DataFrame([x['segment'] for x in active['content']]) - return active_segs - -def merge_chunks( - segments, - chunk_size, - onset: float = 0.5, - offset: Optional[float] = None, -): - """ - Merge operation described in paper - """ - curr_end = 0 - merged_segments = [] - seg_idxs = [] - speaker_idxs = [] - - assert chunk_size > 0 - binarize = Binarize(max_duration=chunk_size, onset=onset, offset=offset) - segments = binarize(segments) - segments_list = [] - for speech_turn in segments.get_timeline(): - segments_list.append(SegmentX(speech_turn.start, speech_turn.end, "UNKNOWN")) - - if len(segments_list) == 0: - print("No active speech found in audio") - return [] - # assert segments_list, "segments_list is empty." - # Make sur the starting point is the start of the segment. 
- curr_start = segments_list[0].start - - for seg in segments_list: - if seg.end - curr_start > chunk_size and curr_end-curr_start > 0: - merged_segments.append({ - "start": curr_start, - "end": curr_end, - "segments": seg_idxs, - }) - curr_start = seg.start - seg_idxs = [] - speaker_idxs = [] - curr_end = seg.end - seg_idxs.append((seg.start, seg.end)) - speaker_idxs.append(seg.speaker) - # add final - merged_segments.append({ - "start": curr_start, - "end": curr_end, - "segments": seg_idxs, - }) - return merged_segments + @staticmethod + def merge_chunks( + segments, + chunk_size, + onset: float = 0.5, + offset: Optional[float] = None, + min_duration_on: float = 0.0, + min_duration_off: float = 0.0, + ): + assert chunk_size > 0 + binarize = Binarize( + max_duration=chunk_size, + onset=onset, + offset=offset, + min_duration_on=min_duration_on, + min_duration_off=min_duration_off, + ) + segments = binarize(segments) + segments_list = [] + for speech_turn in segments.get_timeline(): + segments_list.append( + SegmentX(speech_turn.start, speech_turn.end, "UNKNOWN") + ) + + if len(segments_list) == 0: + print("No active speech found in audio") + return [] + assert segments_list, "segments_list is empty." + return Vad.merge_chunks(segments_list, chunk_size, onset, offset) diff --git a/whisperx/vads/silero.py b/whisperx/vads/silero.py new file mode 100644 index 00000000..e7b44cc4 --- /dev/null +++ b/whisperx/vads/silero.py @@ -0,0 +1,62 @@ +from io import IOBase +from pathlib import Path +from typing import Mapping, Text +from typing import Optional +from typing import Union + +import torch + +from whisperx.diarize import Segment as SegmentX +from whisperx.vads.vad import Vad + +AudioFile = Union[Text, Path, IOBase, Mapping] + + +class Silero(Vad): + # check again default values + def __init__(self, **kwargs): + print(">>Performing voice activity detection using Silero...") + super().__init__(kwargs['vad_onset']) + + self.vad_onset = kwargs['vad_onset'] + self.chunk_size = kwargs['chunk_size'] + self.vad_pipeline, vad_utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=False, + onnx=False, + trust_repo=True) + (self.get_speech_timestamps, _, self.read_audio, _, _) = vad_utils + + def __call__(self, audio: AudioFile, **kwargs): + """use silero to get segments of speech""" + # Only accept 16000 Hz for now. + # Note: Silero models support both 8000 and 16000 Hz. Although other values are not directly supported, + # multiples of 16000 (e.g. 32000 or 48000) are cast to 16000 inside of the JIT model! + sample_rate = audio["sample_rate"] + if sample_rate != 16000: + raise ValueError("Only 16000Hz sample rate is allowed") + + timestamps = self.get_speech_timestamps(audio["waveform"], + model=self.vad_pipeline, + sampling_rate=sample_rate, + max_speech_duration_s=self.chunk_size, + threshold=self.vad_onset + # min_silence_duration_ms = self.min_duration_off/1000 + # min_speech_duration_ms = self.min_duration_on/1000 + # ... 
+ # See silero documentation for full option list + ) + return [SegmentX(i['start'] / sample_rate, i['end'] / sample_rate, "UNKNOWN") for i in timestamps] + + @staticmethod + def preprocess_audio(audio): + return audio + + @staticmethod + def merge_chunks(segments, + chunk_size, + onset: float = 0.5, + offset: Optional[float] = None, + ): + assert chunk_size > 0 + return Vad.merge_chunks(segments, chunk_size, onset, offset) diff --git a/whisperx/vads/vad.py b/whisperx/vads/vad.py new file mode 100644 index 00000000..d186deae --- /dev/null +++ b/whisperx/vads/vad.py @@ -0,0 +1,77 @@ +from typing import Optional + +import pandas as pd +from pyannote.core import Annotation, Segment + + +class Vad: + def __init__(self, vad_onset): + if not (0 < vad_onset < 1): + raise ValueError( + "vad_onset is a decimal value between 0 and 1." + ) + + @staticmethod + def preprocess_audio(audio): + pass + + # keep merge_chunks as static so it can be also used by manually assigned vad_model (see 'load_model') + @staticmethod + def merge_chunks(segments, + chunk_size, + onset: float, + offset: Optional[float]): + """ + Merge operation described in paper + """ + curr_end = 0 + merged_segments = [] + seg_idxs = [] + speaker_idxs = [] + + if not segments: + return [] + + curr_start = segments[0].start + for seg in segments: + if seg.end - curr_start > chunk_size and curr_end - curr_start > 0: + merged_segments.append({ + "start": curr_start, + "end": curr_end, + "segments": seg_idxs, + }) + curr_start = seg.start + seg_idxs = [] + speaker_idxs = [] + curr_end = seg.end + seg_idxs.append((seg.start, seg.end)) + speaker_idxs.append(seg.speaker) + # add final + merged_segments.append({ + "start": curr_start, + "end": curr_end, + "segments": seg_idxs, + }) + + return merged_segments + + # Unused function + @staticmethod + def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0): + active = Annotation() + for k, vad_t in enumerate(vad_arr): + region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset) + active[region, k] = 1 + + if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0: + active = active.support(collar=min_duration_off) + + # remove tracks shorter than min_duration_on + if min_duration_on > 0: + for segment, track in list(active.itertracks()): + if segment.duration < min_duration_on: + del active[segment, track] + + active = active.for_json() + active_segs = pd.DataFrame([x['segment'] for x in active['content']]) + return active_segs
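Putting the pieces together, a hedged end-to-end sketch of the Python API with the headline feature enabled; the file name, model size and `hf_...` token are placeholders, and a CUDA device plus access to the pyannote diarization models are assumed:

```python
import whisperx

device = "cuda"
audio = whisperx.load_audio("audio.mp3")

model = whisperx.load_model("large-v3", device, compute_type="float16")
result = model.transcribe(
    audio,
    batch_size=16,
    detect_language_per_segment=True,  # detect the language for every 30-second chunk
    possible_languages=["nl", "en"],   # optional allow-list; defaults to all languages
)
for seg in result["segments"]:
    print(seg["language"], seg.get("language_confidence"), seg["text"][:60])

# Diarize and collect the unique speaker list that this fork adds to the result.
diarize_model = whisperx.DiarizationPipeline(use_auth_token="hf_...", device=device)
diarize_df = diarize_model(audio)
result = whisperx.assign_word_speakers(diarize_df, result)
print(result["speakers"])
```

The CLI equivalent of the transcription step is the `whisperx audio.mp3 --detect_language_per_segment` invocation shown in the README portion of this diff.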