From b42f3bbd032617403e71dc4931e160f72f25ce63 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Sep 2024 11:26:28 +0200 Subject: [PATCH 1/6] docs: adding PESQ example to gallery --- examples/audio/pesq.py | 113 ++++++++++++++++++++++++ examples/audio/signal_to_noise_ratio.py | 12 +-- examples/image/clip_score.py | 5 ++ 3 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 examples/audio/pesq.py diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py new file mode 100644 index 00000000000..7f7b70599fd --- /dev/null +++ b/examples/audio/pesq.py @@ -0,0 +1,113 @@ +""" +PESQ Metric Calculation for Speech Enhancement +============================================== + +In this notebook, we will calculate the Perceptual Evaluation of Speech Quality (PESQ) score +to assess the improvement in speech quality after applying a basic noise reduction technique. + +PESQ is widely used in speech enhancement, telecommunications, and VoIP to evaluate the +perceived quality of speech signals. +""" + +#%% +# Import necessary libraries +import torch +import torchaudio +from torchmetrics.audio import PerceptualEvaluationSpeechQuality +import matplotlib.pyplot as plt +import numpy as np + +#%% +# Generate Synthetic Clean and Noisy Audio Signals +# We'll generate a clean sine wave (representing a clean speech signal) and add white noise to simulate the noisy version. 
+ +def generate_sine_wave(frequency, duration, sample_rate, amplitude=0.5): + """Generate a clean sine wave at a given frequency.""" + t = torch.linspace(0, duration, int(sample_rate * duration)) + waveform = amplitude * torch.sin(2 * np.pi * frequency * t) + return waveform + +def add_noise(waveform, noise_factor=0.05): + """Add white noise to a waveform.""" + noise = noise_factor * torch.randn(waveform.size()) + noisy_waveform = waveform + noise + return noisy_waveform + +# Parameters for the synthetic audio +sample_rate = 16000 # 16 kHz typical for speech +duration = 3 # 3 seconds of audio +frequency = 440 # A4 note, can represent a simple speech-like tone + +# Generate the clean sine wave +clean_waveform = generate_sine_wave(frequency, duration, sample_rate) + +# Generate the noisy waveform by adding white noise +noisy_waveform = add_noise(clean_waveform) + + +#%% +# Apply Basic Noise Reduction Technique +# In this step, we apply a simple spectral gating method for noise reduction using torchaudio's +# `spectrogram` method. This is to simulate the enhancement of noisy speech. + +def reduce_noise(noisy_signal, sample_rate, threshold=0.2): + """Basic noise reduction using spectral gating.""" + # Compute the spectrogram + spec = torchaudio.transforms.Spectrogram()(noisy_signal) + + # Apply threshold-based gating: values below the threshold will be zeroed out + spec_denoised = spec * (spec > threshold) + + # Convert back to the waveform + inverse_spec = torchaudio.transforms.GriffinLim()(spec_denoised) + return inverse_spec + + +# Apply noise reduction to the noisy waveform +enhanced_waveform = reduce_noise(noisy_waveform, sample_rate) + +#%% +# Initialize the PESQ Metric +# PESQ can be computed in two modes: 'wb' (wideband) or 'nb' (narrowband). +# Here, we are using 'wb' mode for wideband speech quality evaluation. 
+pesq_metric = PerceptualEvaluationSpeechQuality(fs=sample_rate, mode='wb') + +#%% +# Compute PESQ Scores +# We will calculate the PESQ scores for both the noisy and enhanced versions +# compared to the clean signal. +# The PESQ scores give us a numerical evaluation of how well the enhanced speech +# compares to the clean speech. Higher scores indicate better quality. + +pesq_noisy = pesq_metric(clean_waveform, noisy_waveform) +pesq_enhanced = pesq_metric(clean_waveform, enhanced_waveform) + +print(f"PESQ Score for Noisy Audio: {pesq_noisy.item():.4f}") +print(f"PESQ Score for Enhanced Audio: {pesq_enhanced.item():.4f}") + +#%% +# Visualize the waveforms +# We can visualize the waveforms of the clean, noisy, and enhanced audio to see the differences. +fig, axs = plt.subplots(3, 1, figsize=(12, 9)) + +# Plot clean waveform +axs[0].plot(clean_waveform.numpy()) +axs[0].set_title("Clean Audio Waveform (Sine Wave)") +axs[0].set_xlabel("Time") +axs[0].set_ylabel("Amplitude") + +# Plot noisy waveform +axs[1].plot(noisy_waveform.numpy(), color='orange') +axs[1].set_title(f"Noisy Audio Waveform (PESQ: {pesq_noisy.item():.4f})") +axs[1].set_xlabel("Time") +axs[1].set_ylabel("Amplitude") + +# Plot enhanced waveform +axs[2].plot(enhanced_waveform.numpy(), color='green') +axs[2].set_title(f"Enhanced Audio Waveform (PESQ: {pesq_enhanced.item():.4f})") +axs[2].set_xlabel("Time") +axs[2].set_ylabel("Amplitude") + +# Adjust layout for better visualization +fig.tight_layout() +plt.show() \ No newline at end of file diff --git a/examples/audio/signal_to_noise_ratio.py b/examples/audio/signal_to_noise_ratio.py index 01285f58b70..c7130a895e4 100644 --- a/examples/audio/signal_to_noise_ratio.py +++ b/examples/audio/signal_to_noise_ratio.py @@ -16,13 +16,10 @@ import torch from torchmetrics.audio import SignalNoiseRatio -# Set seed for reproducibility -torch.manual_seed(42) -np.random.seed(42) - - # %% # Generate a clean signal (simulating a high-quality recording) + + def 
generate_clean_signal(length: int = 1000) -> Tuple[np.ndarray, np.ndarray]: """Generate a clean signal (sine wave)""" t = np.linspace(0, 1, length) @@ -32,6 +29,8 @@ def generate_clean_signal(length: int = 1000) -> Tuple[np.ndarray, np.ndarray]: # %% # Add Gaussian noise to the signal to simulate the noisy environment + + def add_noise(signal: np.ndarray, noise_level: float = 0.5) -> np.ndarray: """Add Gaussian noise to the signal.""" noise = noise_level * np.random.randn(signal.shape[0]) @@ -40,6 +39,8 @@ def add_noise(signal: np.ndarray, noise_level: float = 0.5) -> np.ndarray: # %% # Apply FFT to filter out the noise + + def fft_denoise(noisy_signal: np.ndarray, threshold: float) -> np.ndarray: """Denoise the signal using FFT.""" freq_domain = np.fft.fft(noisy_signal) # Filter frequencies using FFT @@ -50,6 +51,7 @@ def fft_denoise(noisy_signal: np.ndarray, threshold: float) -> np.ndarray: # %% # Generate and plot clean, noisy, and denoised signals to visualize the reconstruction + length = 1000 t, clean_signal = generate_clean_signal(length) noisy_signal = add_noise(clean_signal, noise_level=0.5) diff --git a/examples/image/clip_score.py b/examples/image/clip_score.py index e465ed8ce1f..f73c5d68333 100644 --- a/examples/image/clip_score.py +++ b/examples/image/clip_score.py @@ -19,6 +19,7 @@ # %% # Get sample images + images = { "astronaut": astronaut(), "cat": cat(), @@ -27,6 +28,7 @@ # %% # Define a hypothetical captions for the images + captions = [ "A photo of an astronaut.", "A photo of a cat.", @@ -35,6 +37,7 @@ # %% # Define the models for CLIPScore + models = [ "openai/clip-vit-base-patch16", # "openai/clip-vit-base-patch32", @@ -44,6 +47,7 @@ # %% # Collect scores for each image-caption pair + score_results = [] for model in models: clip_score = CLIPScore(model_name_or_path=model) @@ -54,6 +58,7 @@ # %% # Create an animation to display the scores + fig, (ax_img, ax_table) = plt.subplots(1, 2, figsize=(10, 5)) From 
1563752e9030121be0ff43b1d9867c1a02665996 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:28:59 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/audio/pesq.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py index 7f7b70599fd..22e3e183a2d 100644 --- a/examples/audio/pesq.py +++ b/examples/audio/pesq.py @@ -9,30 +9,33 @@ perceived quality of speech signals. """ -#%% +# %% # Import necessary libraries +import matplotlib.pyplot as plt +import numpy as np import torch import torchaudio from torchmetrics.audio import PerceptualEvaluationSpeechQuality -import matplotlib.pyplot as plt -import numpy as np -#%% +# %% # Generate Synthetic Clean and Noisy Audio Signals # We'll generate a clean sine wave (representing a clean speech signal) and add white noise to simulate the noisy version. + def generate_sine_wave(frequency, duration, sample_rate, amplitude=0.5): """Generate a clean sine wave at a given frequency.""" t = torch.linspace(0, duration, int(sample_rate * duration)) waveform = amplitude * torch.sin(2 * np.pi * frequency * t) return waveform + def add_noise(waveform, noise_factor=0.05): """Add white noise to a waveform.""" noise = noise_factor * torch.randn(waveform.size()) noisy_waveform = waveform + noise return noisy_waveform + # Parameters for the synthetic audio sample_rate = 16000 # 16 kHz typical for speech duration = 3 # 3 seconds of audio @@ -45,11 +48,12 @@ def add_noise(waveform, noise_factor=0.05): noisy_waveform = add_noise(clean_waveform) -#%% +# %% # Apply Basic Noise Reduction Technique # In this step, we apply a simple spectral gating method for noise reduction using torchaudio's # `spectrogram` method. This is to simulate the enhancement of noisy speech. 
+ def reduce_noise(noisy_signal, sample_rate, threshold=0.2): """Basic noise reduction using spectral gating.""" # Compute the spectrogram @@ -66,13 +70,13 @@ def reduce_noise(noisy_signal, sample_rate, threshold=0.2): # Apply noise reduction to the noisy waveform enhanced_waveform = reduce_noise(noisy_waveform, sample_rate) -#%% +# %% # Initialize the PESQ Metric # PESQ can be computed in two modes: 'wb' (wideband) or 'nb' (narrowband). # Here, we are using 'wb' mode for wideband speech quality evaluation. -pesq_metric = PerceptualEvaluationSpeechQuality(fs=sample_rate, mode='wb') +pesq_metric = PerceptualEvaluationSpeechQuality(fs=sample_rate, mode="wb") -#%% +# %% # Compute PESQ Scores # We will calculate the PESQ scores for both the noisy and enhanced versions # compared to the clean signal. @@ -85,7 +89,7 @@ def reduce_noise(noisy_signal, sample_rate, threshold=0.2): print(f"PESQ Score for Noisy Audio: {pesq_noisy.item():.4f}") print(f"PESQ Score for Enhanced Audio: {pesq_enhanced.item():.4f}") -#%% +# %% # Visualize the waveforms # We can visualize the waveforms of the clean, noisy, and enhanced audio to see the differences. 
fig, axs = plt.subplots(3, 1, figsize=(12, 9)) @@ -97,17 +101,17 @@ def reduce_noise(noisy_signal, sample_rate, threshold=0.2): axs[0].set_ylabel("Amplitude") # Plot noisy waveform -axs[1].plot(noisy_waveform.numpy(), color='orange') +axs[1].plot(noisy_waveform.numpy(), color="orange") axs[1].set_title(f"Noisy Audio Waveform (PESQ: {pesq_noisy.item():.4f})") axs[1].set_xlabel("Time") axs[1].set_ylabel("Amplitude") # Plot enhanced waveform -axs[2].plot(enhanced_waveform.numpy(), color='green') +axs[2].plot(enhanced_waveform.numpy(), color="green") axs[2].set_title(f"Enhanced Audio Waveform (PESQ: {pesq_enhanced.item():.4f})") axs[2].set_xlabel("Time") axs[2].set_ylabel("Amplitude") # Adjust layout for better visualization fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() From 3a81e0833a2adc2a6d39ce571e46df6799a36b91 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Thu, 10 Oct 2024 19:27:45 +0200 Subject: [PATCH 3/6] lint --- examples/audio/pesq.py | 17 +++++++---------- pyproject.toml | 1 + 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py index 22e3e183a2d..13a9f228b68 100644 --- a/examples/audio/pesq.py +++ b/examples/audio/pesq.py @@ -22,18 +22,16 @@ # We'll generate a clean sine wave (representing a clean speech signal) and add white noise to simulate the noisy version. 
-def generate_sine_wave(frequency, duration, sample_rate, amplitude=0.5): +def generate_sine_wave(frequency, duration, sample_rate, amplitude: float = 0.5): """Generate a clean sine wave at a given frequency.""" t = torch.linspace(0, duration, int(sample_rate * duration)) - waveform = amplitude * torch.sin(2 * np.pi * frequency * t) - return waveform + return amplitude * torch.sin(2 * np.pi * frequency * t) -def add_noise(waveform, noise_factor=0.05): +def add_noise(waveform: torch.Tensor, noise_factor: float = 0.05) -> torch.Tensor: """Add white noise to a waveform.""" noise = noise_factor * torch.randn(waveform.size()) - noisy_waveform = waveform + noise - return noisy_waveform + return waveform + noise # Parameters for the synthetic audio @@ -54,7 +52,7 @@ def add_noise(waveform, noise_factor=0.05): # `spectrogram` method. This is to simulate the enhancement of noisy speech. -def reduce_noise(noisy_signal, sample_rate, threshold=0.2): +def reduce_noise(noisy_signal: torch.Tensor, threshold: float = 0.2) -> torch.Tensor: """Basic noise reduction using spectral gating.""" # Compute the spectrogram spec = torchaudio.transforms.Spectrogram()(noisy_signal) @@ -63,12 +61,11 @@ def reduce_noise(noisy_signal, sample_rate, threshold=0.2): spec_denoised = spec * (spec > threshold) # Convert back to the waveform - inverse_spec = torchaudio.transforms.GriffinLim()(spec_denoised) - return inverse_spec + return torchaudio.transforms.GriffinLim()(spec_denoised) # Apply noise reduction to the noisy waveform -enhanced_waveform = reduce_noise(noisy_waveform, sample_rate) +enhanced_waveform = reduce_noise(noisy_waveform) # %% # Initialize the PESQ Metric diff --git a/pyproject.toml b/pyproject.toml index 8b183d4c6b0..5a765978081 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ lint.per-file-ignores."docs/source/conf.py" = [ "D103", ] lint.per-file-ignores."examples/*" = [ + "ANN", # any annotations "D205", # 1 blank line required between summary line and 
description "D212", # [*] Multi-line docstring summary should start at the first line "D415", # First line should end with a period, question mark, or exclamation point From b20b4843e20ac51e987fa55f897a4747a6a47f51 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:49:56 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Nicki Skafte Detlefsen --- examples/audio/pesq.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py index 13a9f228b68..bb5465210a3 100644 --- a/examples/audio/pesq.py +++ b/examples/audio/pesq.py @@ -1,12 +1,10 @@ """ -PESQ Metric Calculation for Speech Enhancement +Evaluating Speech Quality with PESQ metric ============================================== -In this notebook, we will calculate the Perceptual Evaluation of Speech Quality (PESQ) score -to assess the improvement in speech quality after applying a basic noise reduction technique. +This notebook will guide you through calculating the Perceptual Evaluation of Speech Quality (PESQ) score, a key metric in assessing how effective noise reduction and enhancement techniques are in improving speech quality. PESQ is widely adopted in industries such as telecommunications, VoIP, and audio processing. It provides an objective way to measure the perceived quality of speech signals from a human listener's perspective. -PESQ is widely used in speech enhancement, telecommunications, and VoIP to evaluate the -perceived quality of speech signals. +Imagine you’re on a noisy street, trying to have a phone call. The technology behind the scenes aims to clean up your voice and make it sound clearer on the other end. But how do engineers measure that improvement? This is where PESQ comes in. 
In this notebook, we will simulate a similar scenario, applying a simple noise reduction technique and using the PESQ score to evaluate how much the speech quality improves. """ # %% From 159c30220f5111646ee3a49d428145aec4ac3af7 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 15 Oct 2024 12:58:40 +0200 Subject: [PATCH 5/6] lint --- examples/audio/pesq.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py index bb5465210a3..79b82115738 100644 --- a/examples/audio/pesq.py +++ b/examples/audio/pesq.py @@ -2,9 +2,15 @@ Evaluating Speech Quality with PESQ metric ============================================== -This notebook will guide you through calculating the Perceptual Evaluation of Speech Quality (PESQ) score, a key metric in assessing how effective noise reduction and enhancement techniques are in improving speech quality. PESQ is widely adopted in industries such as telecommunications, VoIP, and audio processing. It provides an objective way to measure the perceived quality of speech signals from a human listener's perspective. - -Imagine you’re on a noisy street, trying to have a phone call. The technology behind the scenes aims to clean up your voice and make it sound clearer on the other end. But how do engineers measure that improvement? This is where PESQ comes in. In this notebook, we will simulate a similar scenario, applying a simple noise reduction technique and using the PESQ score to evaluate how much the speech quality improves. +This notebook will guide you through calculating the Perceptual Evaluation of Speech Quality (PESQ) score, + a key metric in assessing how effective noise reduction and enhancement techniques are in improving speech quality. +PESQ is widely adopted in industries such as telecommunications, VoIP, and audio processing. +It provides an objective way to measure the perceived quality of speech signals from a human listener's perspective. 
+ +Imagine being on a noisy street, trying to have a phone call. The technology behind the scenes aims + to clean up your voice and make it sound clearer on the other end. But how do engineers measure that improvement? +This is where PESQ comes in. In this notebook, we will simulate a similar scenario, applying a simple noise reduction + technique and using the PESQ score to evaluate how much the speech quality improves. """ # %% From b130cd639802338ffe844aab2dc5bd08ed4875a6 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 15 Oct 2024 19:23:58 +0200 Subject: [PATCH 6/6] fix? --- examples/audio/pesq.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/audio/pesq.py b/examples/audio/pesq.py index 79b82115738..6afde2bfdd5 100644 --- a/examples/audio/pesq.py +++ b/examples/audio/pesq.py @@ -4,12 +4,12 @@ This notebook will guide you through calculating the Perceptual Evaluation of Speech Quality (PESQ) score, a key metric in assessing how effective noise reduction and enhancement techniques are in improving speech quality. -PESQ is widely adopted in industries such as telecommunications, VoIP, and audio processing. -It provides an objective way to measure the perceived quality of speech signals from a human listener's perspective. + PESQ is widely adopted in industries such as telecommunications, VoIP, and audio processing. + It provides an objective way to measure the perceived quality of speech signals from a human listener's perspective. Imagine being on a noisy street, trying to have a phone call. The technology behind the scenes aims to clean up your voice and make it sound clearer on the other end. But how do engineers measure that improvement? -This is where PESQ comes in. In this notebook, we will simulate a similar scenario, applying a simple noise reduction + This is where PESQ comes in. 
In this notebook, we will simulate a similar scenario, applying a simple noise reduction technique and using the PESQ score to evaluate how much the speech quality improves. """ @@ -79,8 +79,7 @@ def reduce_noise(noisy_signal: torch.Tensor, threshold: float = 0.2) -> torch.Te # %% # Compute PESQ Scores -# We will calculate the PESQ scores for both the noisy and enhanced versions -# compared to the clean signal. +# We will calculate the PESQ scores for both the noisy and enhanced versions compared to the clean signal. # The PESQ scores give us a numerical evaluation of how well the enhanced speech # compares to the clean speech. Higher scores indicate better quality.