Overlap Speech Detection - Manoj

Manoj-2702 · Nov 24, 2023 · 0a5fc49 · 0a5fc49
1 parent 126ff63
commit 0a5fc49
Show file tree

Hide file tree

Showing 7 changed files with 145 additions and 0 deletions.
diff --git a/OverlappedSpeech/.env b/OverlappedSpeech/.env
@@ -0,0 +1 @@
+HUGGING_FACE_TOKEN="<your-hugging-face-access-token>"
diff --git a/Speaker_Verification/IIsc_Voice_Sample.mp3 → OverlappedSpeech/IIsc_Voice_Sample.mp3 b/Speaker_Verification/IIsc_Voice_Sample.mp3 → OverlappedSpeech/IIsc_Voice_Sample.mp3
diff --git a/OverlappedSpeech/README.md b/OverlappedSpeech/README.md
@@ -0,0 +1,62 @@
+# Voice Activity Detection with Overlap Detection
+
+This project uses the WebRTC VAD module to perform Voice Activity Detection (VAD) on an audio file. It also applies noise reduction to the audio file before performing VAD.
+
+## Dependencies
+
+- Python 3
+- numpy
+- librosa
+- matplotlib
+- webrtcvad
+- soundfile
+- noisereduce
+
+You can install these dependencies using pip:
+
+```bash
+pip install numpy librosa matplotlib webrtcvad soundfile noisereduce
+```
+
+or
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+1. Clone the repo
+
+```bash
+    git clone https://github.com/Manoj-2702/FacialAnalysis-IISc.git
+```
+
+2. Navigate to the directory
+
+```bash
+    cd FacialAnalysis-IISc/OverlappedSpeech
+```
+
+3. Run the script
+
+```bash
+    python vad.py
+```
+
+Alternatively, call the function from your own code:
+
+```python
+from vad import vad_with_overlap_detection
+
+# Replace 'your_audio_file.wav' with the path to your audio file
+audio_file_path = 'your_audio_file.wav'
+
+# Run VAD with overlap detection and plot the graph
+vad_with_overlap_detection(audio_file_path)
+
+```
+
+4. The output is a figure showing the waveform and the frame-by-frame VAD result.
+
+## Audio File
+
+A sample audio file, `iisc_1.mp3`, is included in this directory.
diff --git a/OverlappedSpeech/iisc_1.mp3 b/OverlappedSpeech/iisc_1.mp3
diff --git a/OverlappedSpeech/overlap.py b/OverlappedSpeech/overlap.py
@@ -0,0 +1,16 @@
+# from pyannote.audio import Pipeline
+from pyannote.audio.pipelines import OverlappedSpeechDetection
+from dotenv import load_dotenv
+import os
+load_dotenv()
+
+hugging_face_token=os.environ.get("HUGGING_FACE_TOKEN")
+pipeline = OverlappedSpeechDetection.from_pretrained("pyannote/overlapped-speech-detection",use_auth_token=hugging_face_token)
+audio_file_path = "IIsc_Voice_Sample.mp3"
+
+output = pipeline({'uri': 'filename', 'audio': audio_file_path})
+
+for speech in output['uri'].get_timeline().itertracks():
+    # Two or more speakers are active between speech.start and speech.end
+    print(f"Overlapped Speech Detected: {speech.start} - {speech.end}")
+
diff --git a/OverlappedSpeech/vad.py b/OverlappedSpeech/vad.py
@@ -0,0 +1,66 @@
+import numpy as np
+import librosa
+import matplotlib.pyplot as plt
+import webrtcvad
+import soundfile as sf
+import librosa.display
+import noisereduce as nr
+
+def vad_with_overlap_detection(audio_file):
+    """
+    Performs Voice Activity Detection on an audio file.
+
+    Args:
+        audio_file (str): Path to the audio file.
+    
+    Returns:
+        Plots the waveform and VAD result of the Audio File
+    """
+    # Load the audio file
+    y, sr = librosa.load(audio_file, sr=None)
+    y = librosa.to_mono(y)
+    y = nr.reduce_noise(y=y, sr=sr)
+    # Ensure the audio is mono
+    if len(y.shape) > 1:
+        y = np.mean(y, axis=1)
+
+    # Resample to 16000 Hz
+    if sr != 16000:
+        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        sr = 16000
+
+    # Create a VAD object
+    vad = webrtcvad.Vad()
+
+    # Set its aggressiveness mode
+    vad.set_mode(2)
+
+    # Convert the float audio data to int16
+    y = (y * 32767).astype(np.int16)
+
+    # Frame the audio data
+    frame_duration = 0.02  # 20 ms
+    frame_length = int(sr * frame_duration)
+    frames = [y[i:i+frame_length] for i in range(0, len(y), frame_length)]
+
+    # Apply VAD on each frame
+    is_speech = [vad.is_speech(frame.tobytes(), sr) for frame in frames]
+
+
+    # Plot the waveform and VAD result
+    plt.figure(figsize=(12, 6))
+    plt.subplot(2, 1, 1)
+    plt.plot(y)
+    plt.title('Waveform')
+    plt.subplot(2, 1, 2)
+    plt.plot(is_speech)
+    plt.title('VAD Result')
+    plt.tight_layout()
+    plt.show()
+
+
+# Replace 'your_audio_file.wav' with the path to your audio file
+audio_file_path = 'iisc_1.mp3'
+
+# Run VAD with overlap detection and plot the graph
+vad_with_overlap_detection(audio_file_path)
diff --git a/requirements.txt b/requirements.txt
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		HUGGING_FACE_TOKEN="<your-hugging-face-access-token>"