From 5bbe75f1e5f00659fcc2abdd776b9b35e4661617 Mon Sep 17 00:00:00 2001 From: Hasan Mehdi Date: Thu, 30 May 2024 11:33:20 -0400 Subject: [PATCH 1/6] Updated readme embed-extraction pipeline --- README.md | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f9a89dbc..b248604f 100644 --- a/README.md +++ b/README.md @@ -284,19 +284,22 @@ Obtain overlap-aware speaker embeddings from a microphone stream: ```python import rx.operators as ops import diart.operators as dops -from diart.sources import MicrophoneAudioSource +from diart.sources import MicrophoneAudioSource #, FileAudioSource from diart.blocks import SpeakerSegmentation, OverlapAwareSpeakerEmbedding -segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation") -embedding = OverlapAwareSpeakerEmbedding.from_pretrained("pyannote/embedding") +segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation", use_hf_token="") +embedding = OverlapAwareSpeakerEmbedding.from_pretrained("pyannote/embedding", use_hf_token="") + mic = MicrophoneAudioSource() +# To take input from file: +# mic = FileAudioSource("", sample_rate=16000) stream = mic.stream.pipe( # Reformat stream to 5s duration and 500ms shift dops.rearrange_audio_stream(sample_rate=segmentation.model.sample_rate), ops.map(lambda wav: (wav, segmentation(wav))), ops.starmap(embedding) -).subscribe(on_next=lambda emb: print(emb.shape)) +).subscribe(on_next=lambda emb: print(emb)) #emb.shape to display shape mic.read() ``` @@ -304,10 +307,13 @@ mic.read() Output: ``` -# Shape is (batch_size, num_speakers, embedding_dim) -torch.Size([1, 3, 512]) -torch.Size([1, 3, 512]) -torch.Size([1, 3, 512]) +# Displaying embeds: +tensor([[[-0.0442, -0.0327, -0.0910, ..., 0.0134, 0.0209, 0.0050], + [-0.0404, -0.0342, -0.0780, ..., 0.0395, 0.0334, -0.0140], + [-0.0404, -0.0342, -0.0780, ..., 0.0395, 0.0334, -0.0140]]]) +tensor([[[-0.0724, 0.0049, -0.0660, ..., 0.0359, 0.0247, -0.0256], + 
[-0.0462, -0.0256, -0.0642, ..., 0.0417, 0.0273, -0.0135], + [-0.0459, -0.0263, -0.0639, ..., 0.0412, 0.0269, -0.0131]]]) ... ``` From 9e4388997853c5211c440c66fd2bf81b122c4cc9 Mon Sep 17 00:00:00 2001 From: Hasan Mehdi Date: Thu, 30 May 2024 11:37:57 -0400 Subject: [PATCH 2/6] Updated readme embed-extraction pipeline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b248604f..25138bea 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,7 @@ mic = MicrophoneAudioSource() stream = mic.stream.pipe( # Reformat stream to 5s duration and 500ms shift - dops.rearrange_audio_stream(sample_rate=segmentation.model.sample_rate), + dops.rearrange_audio_stream(sample_rate=16000), ops.map(lambda wav: (wav, segmentation(wav))), ops.starmap(embedding) ).subscribe(on_next=lambda emb: print(emb)) #emb.shape to display shape From 629eb5effb6e2ed7d0ed9c24cb1e84abf0cef60c Mon Sep 17 00:00:00 2001 From: Juan Coria Date: Fri, 28 Jun 2024 23:49:37 +0200 Subject: [PATCH 3/6] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 25138bea..58e0368c 100644 --- a/README.md +++ b/README.md @@ -284,7 +284,7 @@ Obtain overlap-aware speaker embeddings from a microphone stream: ```python import rx.operators as ops import diart.operators as dops -from diart.sources import MicrophoneAudioSource #, FileAudioSource +from diart.sources import MicrophoneAudioSource, FileAudioSource from diart.blocks import SpeakerSegmentation, OverlapAwareSpeakerEmbedding segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation", use_hf_token="") From 6b79055543f866e9b3a3c7df597565935c2ae59b Mon Sep 17 00:00:00 2001 From: Juan Coria Date: Fri, 28 Jun 2024 23:50:29 +0200 Subject: [PATCH 4/6] Apply suggestions from code review --- README.md | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 58e0368c..3a3f3f11 
100644 --- a/README.md +++ b/README.md @@ -287,33 +287,30 @@ import diart.operators as dops from diart.sources import MicrophoneAudioSource, FileAudioSource from diart.blocks import SpeakerSegmentation, OverlapAwareSpeakerEmbedding -segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation", use_hf_token="") -embedding = OverlapAwareSpeakerEmbedding.from_pretrained("pyannote/embedding", use_hf_token="") +segmentation = SpeakerSegmentation.from_pretrained("pyannote/segmentation") +embedding = OverlapAwareSpeakerEmbedding.from_pretrained("pyannote/embedding") -mic = MicrophoneAudioSource() +source = MicrophoneAudioSource() # To take input from file: -# mic = FileAudioSource("", sample_rate=16000) +# source = FileAudioSource("", sample_rate=16000) -stream = mic.stream.pipe( +stream = source.stream.pipe( # Reformat stream to 5s duration and 500ms shift - dops.rearrange_audio_stream(sample_rate=16000), + dops.rearrange_audio_stream(sample_rate=source.sample_rate), ops.map(lambda wav: (wav, segmentation(wav))), ops.starmap(embedding) -).subscribe(on_next=lambda emb: print(emb)) #emb.shape to display shape +).subscribe(on_next=lambda emb: print(emb.shape)) -mic.read() +source.read() ``` Output: ``` -# Displaying embeds: -tensor([[[-0.0442, -0.0327, -0.0910, ..., 0.0134, 0.0209, 0.0050], - [-0.0404, -0.0342, -0.0780, ..., 0.0395, 0.0334, -0.0140], - [-0.0404, -0.0342, -0.0780, ..., 0.0395, 0.0334, -0.0140]]]) -tensor([[[-0.0724, 0.0049, -0.0660, ..., 0.0359, 0.0247, -0.0256], - [-0.0462, -0.0256, -0.0642, ..., 0.0417, 0.0273, -0.0135], - [-0.0459, -0.0263, -0.0639, ..., 0.0412, 0.0269, -0.0131]]]) +# Shape is (batch_size, num_speakers, embedding_dim) +torch.Size([1, 3, 512]) +torch.Size([1, 3, 512]) +torch.Size([1, 3, 512]) ...
``` From 7478e872026a18991582cb710adc8ff2843f2269 Mon Sep 17 00:00:00 2001 From: Juan Coria Date: Fri, 28 Jun 2024 23:53:05 +0200 Subject: [PATCH 5/6] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 3a3f3f11..f55f670a 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,9 @@ source = MicrophoneAudioSource() # To take input from file: # source = FileAudioSource("", sample_rate=16000) +# Make sure the model has been trained with the same sample rate +print(source.sample_rate) + stream = source.stream.pipe( # Reformat stream to 5s duration and 500ms shift dops.rearrange_audio_stream(sample_rate=source.sample_rate), From 36e1a52e2823aac76826989d7ce8b98ffbb8938c Mon Sep 17 00:00:00 2001 From: Juan Coria Date: Fri, 28 Jun 2024 23:53:59 +0200 Subject: [PATCH 6/6] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f55f670a..42bf3da3 100644 --- a/README.md +++ b/README.md @@ -294,7 +294,7 @@ source = MicrophoneAudioSource() # To take input from file: # source = FileAudioSource("", sample_rate=16000) -# Make sure the model has been trained with the same sample rate +# Make sure the models have been trained with this sample rate print(source.sample_rate) stream = source.stream.pipe(