m-bain · cvl01 · Sep 26, 2024 · Oct 14, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 whisperx.egg-info/
 **/__pycache__/
 .ipynb_checkpoints
+build/
+dist/
+.vscode/
diff --git a/README.md b/README.md
@@ -259,7 +259,7 @@ Bug finding and pull requests are also highly appreciated to keep this project g
 
 * [ ] Add benchmarking code (TEDLIUM for spd/WER & word segmentation)
 
-* [ ] Allow silero-vad as alternative VAD option
+* [x] Allow silero-vad as alternative VAD option
 
 * [ ] Improve diarization (word level). *Harder than first thought...*
 
@@ -281,7 +281,9 @@ Borrows important alignment code from [PyTorch tutorial on forced alignment](htt
 And uses the wonderful pyannote VAD / Diarization https://github.com/pyannote/pyannote-audio
 
 
-Valuable VAD & Diarization Models from [pyannote audio][https://github.com/pyannote/pyannote-audio]
+Valuable VAD & Diarization Models from:
+- [pyannote audio](https://github.com/pyannote/pyannote-audio)
+- [silero vad](https://github.com/snakers4/silero-vad)
 
 Great backend from [faster-whisper](https://github.com/guillaumekln/faster-whisper) and [CTranslate2](https://github.com/OpenNMT/CTranslate2)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = ["setuptools>=65", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "whisperx"
+version = "3.1.1"
+description = "Time-Accurate Automatic Speech Recognition using Whisper."
+readme = "README.md"
+requires-python = ">=3.8"
+authors = [
+    {name = "Max Bain"}
+]
+license = {text = "MIT"}
+dependencies = [
+    "torch>=2",
+    "torchaudio>=2",
+    "faster-whisper==1.0.3",
+    "transformers",
+    "pandas",
+    "setuptools>=65",
+    "nltk",
+    "pyannote.audio==3.3.2"
+]
+
+[project.optional-dependencies]
+dev = ["pytest"]
+
+[project.scripts]
+whisperx = "whisperx.transcribe:cli"
+
+[tool.setuptools.packages.find]
+exclude = ["tests*"]
+
+[tool.setuptools]
+include-package-data = true
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
diff --git a/whisperx/__init__.py b/whisperx/__init__.py
@@ -1,4 +1,4 @@
-from .transcribe import load_model
 from .alignment import load_align_model, align
 from .audio import load_audio
-from .diarize import assign_word_speakers, DiarizationPipeline
+from .diarize import assign_word_speakers, DiarizationPipeline
+from .asr import load_model
diff --git a/whisperx/alignment.py b/whisperx/alignment.py
@@ -184,11 +184,13 @@ def align(
         t1 = segment["start"]
         t2 = segment["end"]
         text = segment["text"]
+        language = segment["language"]
 
         aligned_seg: SingleAlignedSegment = {
             "start": t1,
             "end": t2,
             "text": text,
+            "language": language,
             "words": [],
         }
 
@@ -324,6 +326,7 @@ def align(
                 "start": sentence_start,
                 "end": sentence_end,
                 "words": sentence_words,
+                "language": language
             })
 
             if return_char_alignments:
@@ -337,7 +340,7 @@ def align(
         aligned_subsegments["start"] = interpolate_nans(aligned_subsegments["start"], method=interpolate_method)
         aligned_subsegments["end"] = interpolate_nans(aligned_subsegments["end"], method=interpolate_method)
         # concatenate sentences with same timestamps
-        agg_dict = {"text": " ".join, "words": "sum"}
+        agg_dict = {"text": " ".join, "words": "sum", "language": "first"}
         if model_lang in LANGUAGES_WITHOUT_SPACES:
             agg_dict["text"] = "".join
         if return_char_alignments: