diff --git a/OverlappedSpeech/.env b/OverlappedSpeech/.env new file mode 100644 index 0000000..2bf4e33 --- /dev/null +++ b/OverlappedSpeech/.env @@ -0,0 +1 @@ +HUGGING_FACE_TOKEN="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \ No newline at end of file diff --git a/Speaker_Verification/IIsc_Voice_Sample.mp3 b/OverlappedSpeech/IIsc_Voice_Sample.mp3 similarity index 100% rename from Speaker_Verification/IIsc_Voice_Sample.mp3 rename to OverlappedSpeech/IIsc_Voice_Sample.mp3 diff --git a/OverlappedSpeech/README.md b/OverlappedSpeech/README.md new file mode 100644 index 0000000..c65e1d1 --- /dev/null +++ b/OverlappedSpeech/README.md @@ -0,0 +1,62 @@ +# Voice Activity Detection with Overlap Detection + +This project uses the WebRTC VAD module to perform Voice Activity Detection (VAD) on an audio file. It also applies noise reduction to the audio file before performing VAD. + +## Dependencies + +- Python 3 +- numpy +- librosa
- matplotlib
- webrtcvad
- soundfile
- noisereduce

You can install these dependencies using pip:

```bash
pip install numpy librosa matplotlib webrtcvad soundfile noisereduce
```

or

```bash
pip install -r requirements.txt
```

## Usage

1. Clone the repo

```bash
 git clone https://github.com/Manoj-2702/FacialAnalysis-IISc.git
```

2. Navigate to the directory

```bash
 cd FacialAnalysis-IISc/OverlappedSpeech
```

3. Run the script

```
 python vad.py
```

```python
from vad import vad_with_overlap_detection

# Replace 'your_audio_file.wav' with the path to your audio file
audio_file_path = 'your_audio_file.wav'

# Run VAD with overlap detection and plot the graph
vad_with_overlap_detection(audio_file_path)

```

4. The output will be a graph showing the overlapping speech regions.

## Audio File

Sample audio file is in this repo. 
# from pyannote.audio import Pipeline
from pyannote.audio.pipelines import OverlappedSpeechDetection
from dotenv import load_dotenv
import os

load_dotenv()

# The Hugging Face token comes from the environment (.env file) — never
# hard-code it.  Fail fast with a clear message instead of a confusing
# authentication error from the hub.
hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN")
if not hugging_face_token:
    raise RuntimeError("HUGGING_FACE_TOKEN is not set; add it to your .env file.")

# Pretrained overlapped-speech-detection pipeline (requires accepting the
# model's terms of use on the Hugging Face hub).
pipeline = OverlappedSpeechDetection.from_pretrained(
    "pyannote/overlapped-speech-detection",
    use_auth_token=hugging_face_token,
)

audio_file_path = "IIsc_Voice_Sample.mp3"

# The pipeline returns a pyannote.core.Annotation whose timeline holds one
# Segment per region where two or more speakers are simultaneously active.
output = pipeline({'uri': 'filename', 'audio': audio_file_path})

# BUG FIX: the Annotation is not indexable as output['uri'], and
# itertracks() yields (segment, track) tuples, so speech.start would have
# failed on a tuple.  Iterating the timeline yields Segment objects with
# .start / .end attributes directly.
for speech in output.get_timeline():
    # Two or more speakers are active between speech.start and speech.end
    print(f"Overlapped Speech Detected: {speech.start} - {speech.end}")
def vad_with_overlap_detection(audio_file, aggressiveness=2, frame_duration=0.02):
    """
    Perform Voice Activity Detection on an audio file and plot the result.

    The audio is denoised, converted to mono, resampled to 16 kHz, split
    into fixed-size frames, and each frame is classified as speech or
    non-speech by the WebRTC VAD.

    Args:
        audio_file (str): Path to the audio file.
        aggressiveness (int): WebRTC VAD mode, 0 (least) to 3 (most
            aggressive at filtering out non-speech).  Defaults to 2,
            matching the previous hard-coded value.
        frame_duration (float): Frame size in seconds.  WebRTC VAD only
            accepts 10, 20 or 30 ms frames; defaults to 0.02 (20 ms).

    Returns:
        None.  Shows a matplotlib figure with the waveform and the
        per-frame VAD decisions.
    """
    # Load at the native sample rate; to_mono collapses multi-channel
    # input, so no further mono check is needed afterwards.
    y, sr = librosa.load(audio_file, sr=None)
    y = librosa.to_mono(y)

    # Denoise before VAD so background noise is less likely to be
    # classified as speech.
    y = nr.reduce_noise(y=y, sr=sr)

    # WebRTC VAD only supports 8/16/32/48 kHz; standardise on 16 kHz.
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    vad = webrtcvad.Vad()
    vad.set_mode(aggressiveness)

    # Convert float samples to 16-bit PCM.  BUG FIX: clip first — noise
    # reduction can push samples outside [-1, 1], and the bare int16 cast
    # would wrap around and corrupt the signal.
    pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)

    # BUG FIX: only complete frames are classified.  The old slicing kept
    # a short trailing frame, and webrtcvad raises an error on frames that
    # are not exactly 10/20/30 ms long.
    frame_length = int(sr * frame_duration)
    frames = _full_frames(pcm, frame_length)

    # webrtcvad expects raw little-endian 16-bit PCM bytes per frame.
    is_speech = [vad.is_speech(frame.tobytes(), sr) for frame in frames]

    _plot_vad(pcm, is_speech)


def _full_frames(samples, frame_length):
    """Split *samples* into consecutive frames of exactly *frame_length*.

    The trailing partial frame (if any) is dropped because webrtcvad
    rejects frames of any other length.
    """
    usable = len(samples) - len(samples) % frame_length
    return [samples[i:i + frame_length] for i in range(0, usable, frame_length)]


def _plot_vad(samples, is_speech):
    """Plot the waveform and the per-frame VAD decisions in one figure."""
    plt.figure(figsize=(12, 6))
    plt.subplot(2, 1, 1)
    plt.plot(samples)
    plt.title('Waveform')
    plt.subplot(2, 1, 2)
    plt.plot(is_speech)
    plt.title('VAD Result')
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Guarded so importing this module (as the README suggests) no longer
    # triggers a full VAD run as an import side effect.
    # Replace 'iisc_1.mp3' with the path to your audio file.
    audio_file_path = 'iisc_1.mp3'

    # Run VAD with overlap detection and plot the graph
    vad_with_overlap_detection(audio_file_path)