-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
126ff63
commit 0a5fc49
Showing
7 changed files
with
145 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
HUGGING_FACE_TOKEN="<your-hugging-face-token>"  # NOTE(review): a real token was committed here — revoke/rotate it immediately and never commit secrets |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Voice Activity Detection with Overlap Detection | ||
|
||
This project uses the WebRTC VAD module to perform Voice Activity Detection (VAD) on an audio file. It also applies noise reduction to the audio file before performing VAD. | ||
|
||
## Dependencies | ||
|
||
- Python 3 | ||
- numpy | ||
- librosa | ||
- matplotlib | ||
- webrtcvad | ||
- soundfile | ||
- noisereduce | ||
|
||
You can install these dependencies using pip: | ||
|
||
```bash | ||
pip install numpy librosa matplotlib webrtcvad soundfile noisereduce | ||
``` | ||
|
||
or | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Usage | ||
|
||
1. Clone the repo | ||
|
||
```bash | ||
git clone https://github.com/Manoj-2702/FacialAnalysis-IISc.git | ||
``` | ||
|
||
2. Navigate to the directory | ||
|
||
```bash | ||
cd FacialAnalysis-IISc/OverlappedSpeech | ||
``` | ||
|
||
3. Run the script | ||
|
||
``` | ||
python vad.py | ||
``` | ||
|
||
```python | ||
from vad import vad_with_overlap_detection | ||
|
||
# Replace 'your_audio_file.wav' with the path to your audio file | ||
audio_file_path = 'your_audio_file.wav' | ||
|
||
# Run VAD with overlap detection and plot the graph | ||
vad_with_overlap_detection(audio_file_path) | ||
|
||
``` | ||
|
||
4. The output will be a graph showing the overlapping speech regions. | ||
|
||
## Audio File | ||
|
||
A sample audio file, `iisc_1.mp3`, is included in this repository. |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Detect regions where two or more speakers talk simultaneously using the
# pretrained pyannote "overlapped-speech-detection" pipeline.
# from pyannote.audio import Pipeline
from pyannote.audio.pipelines import OverlappedSpeechDetection
from dotenv import load_dotenv
import os

# Pull HUGGING_FACE_TOKEN from a local .env file so the credential stays
# out of source control.
load_dotenv()

hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN")
if not hugging_face_token:
    # Fail early with a clear message instead of a cryptic auth error later.
    raise RuntimeError("HUGGING_FACE_TOKEN is not set; add it to your .env file.")

pipeline = OverlappedSpeechDetection.from_pretrained(
    "pyannote/overlapped-speech-detection",
    use_auth_token=hugging_face_token,
)
audio_file_path = "IIsc_Voice_Sample.mp3"

# The pipeline returns a pyannote.core.Annotation of overlapped regions.
output = pipeline({'uri': 'filename', 'audio': audio_file_path})

# BUG FIX: `output` is an Annotation, not a dict — `output['uri']` raised.
# Additionally, `itertracks()` yields (segment, track) tuples, so `.start`
# would fail. Iterate the annotation's timeline directly: each item is a
# Segment with .start/.end attributes.
for segment in output.get_timeline():
    # Two or more speakers are active between segment.start and segment.end
    print(f"Overlapped Speech Detected: {segment.start} - {segment.end}")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import numpy as np | ||
import librosa | ||
import matplotlib.pyplot as plt | ||
import webrtcvad | ||
import soundfile as sf | ||
import librosa.display | ||
import noisereduce as nr | ||
|
||
def vad_with_overlap_detection(audio_file):
    """
    Perform Voice Activity Detection (VAD) on an audio file and plot the result.

    The audio is denoised, converted to mono, resampled to 16 kHz, and split
    into 20 ms frames; each frame is classified as speech / non-speech with
    WebRTC VAD.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        None. Shows a matplotlib figure with the waveform (top) and the
        per-frame VAD decision (bottom).
    """
    # Load at the file's native sample rate; librosa yields float32 in [-1, 1].
    y, sr = librosa.load(audio_file, sr=None)
    # to_mono guarantees a 1-D signal, so no extra channel check is needed.
    y = librosa.to_mono(y)
    # Reduce stationary background noise before classification.
    y = nr.reduce_noise(y=y, sr=sr)

    # WebRTC VAD only accepts 8/16/32/48 kHz mono 16-bit PCM; normalize to 16 kHz.
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Create a VAD object and set its aggressiveness (0 = least, 3 = most).
    vad = webrtcvad.Vad()
    vad.set_mode(2)

    # Convert the float audio data to 16-bit PCM as required by webrtcvad.
    y = (y * 32767).astype(np.int16)

    # Frame the audio into 20 ms windows.
    frame_duration = 0.02  # 20 ms — one of the durations webrtcvad accepts
    frame_length = int(sr * frame_duration)
    # BUG FIX: webrtcvad raises for frames that are not exactly 10/20/30 ms,
    # and the original slicing produced a short trailing frame. Keep only
    # complete frames.
    n_frames = len(y) // frame_length
    frames = [y[i * frame_length:(i + 1) * frame_length] for i in range(n_frames)]

    # Classify each frame: True = speech, False = non-speech.
    is_speech = [vad.is_speech(frame.tobytes(), sr) for frame in frames]

    # Plot the waveform and the VAD result.
    plt.figure(figsize=(12, 6))
    plt.subplot(2, 1, 1)
    plt.plot(y)
    plt.title('Waveform')
    plt.subplot(2, 1, 2)
    plt.plot(is_speech)
    plt.title('VAD Result')
    plt.tight_layout()
    plt.show()
|
||
|
||
# Replace 'iisc_1.mp3' with the path to your audio file
audio_file_path = 'iisc_1.mp3'

# Run only when executed as a script — importing this module should not
# trigger audio processing or open a plot window.
if __name__ == "__main__":
    # Run VAD with overlap detection and plot the graph
    vad_with_overlap_detection(audio_file_path)
Binary file not shown.