Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Transformers in the Wav2Vec2 Encoder for the ASR Inference #1520

Merged
20 commits merged on Nov 3, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
switch audio to the existing one
hkwon committed Oct 27, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 11f5ff17a5ac3cd7ae104f87d7622bdb3baff0be
2 changes: 0 additions & 2 deletions python/tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -6,5 +6,3 @@ tensorflow-cpu==2.11.*
pytest
wurlitzer==3.0.*;platform_system=='Linux'
torch
torchaudio
requests
20 changes: 10 additions & 10 deletions python/tests/test_transformers.py
Original file line number Diff line number Diff line change
@@ -957,7 +957,10 @@ def teardown_class(cls):
[
(
"facebook/wav2vec2-large-robust-ft-swbd-300h",
"I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT",
[
"MISTER QUILTER IS THE APOSSEL OF THE MIDDLE CLASSES AND"
" WE ARE GLAD TO WELCOME HIS GOSPEL",
],
),
],
)
@@ -969,7 +972,6 @@ def test_transformers_wav2vec2(
expected_transcription,
):
import torch
import torchaudio
import transformers

converter = ctranslate2.converters.TransformersConverter(
@@ -999,12 +1001,11 @@ def test_transformers_wav2vec2(
inter_threads=1,
)

waveform, sampling_rate = torchaudio.load(
os.path.join(test_utils.get_data_dir(), "audio", "test.wav")
speech_array = np.load(
os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy")
)
speech_array = waveform[0].numpy()
input_values = w2v2_processor(
speech_array.astype(np.float32),
speech_array,
padding=True,
return_tensors="pt",
sampling_rate=16000,
@@ -1070,7 +1071,6 @@ def test_transformers_wav2vec2(
logits = w2v2_model.lm_head(hidden_states.to(torch.float32))[0]

predicted_ids = torch.argmax(logits, dim=-1)
transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)[
0
]
assert transcription == expected_transcription
transcription = w2v2_processor.decode(predicted_ids, output_word_offsets=True)

assert transcription[0] == expected_transcription[0]