diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7001852df4..a49d99fce3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -71,7 +71,6 @@ model implementations and application components.
tutorials/asr_inference_with_cuda_ctc_decoder_tutorial
tutorials/online_asr_tutorial
tutorials/device_asr
- tutorials/device_avsr
tutorials/forced_alignment_tutorial
tutorials/forced_alignment_for_multilingual_data_tutorial
tutorials/tacotron2_pipeline_tutorial
@@ -133,13 +132,6 @@ Tutorials
:link: tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html
:tags: Pipelines,ASR,CTC-Decoder,CUDA-CTC-Decoder
-.. customcarditem::
- :header: On device audio-visual automatic speech recognition
-   :card_description: Learn how to stream audio and video from a laptop webcam and perform audio-visual automatic speech recognition using the Emformer RNN-T model.
- :image: https://download.pytorch.org/torchaudio/doc-assets/avsr/transformed.gif
- :link: tutorials/device_avsr.html
- :tags: I/O,Pipelines,RNNT
-
.. customcarditem::
:header: Loading waveform Tensors from files and saving them
   :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info, torchaudio.load and torchaudio.save functions.
diff --git a/examples/tutorials/device_avsr.py b/examples/tutorials/device_avsr.py
deleted file mode 100644
index 0bb7a5792d..0000000000
--- a/examples/tutorials/device_avsr.py
+++ /dev/null
@@ -1,408 +0,0 @@
-"""
-Device AV-ASR with Emformer RNN-T
-=================================
-
-**Author**: `Pingchuan Ma `__, `Moto
-Hira `__.
-
-This tutorial shows how to run on-device audio-visual speech recognition
-(AV-ASR, or AVSR) with TorchAudio on streaming device inputs, i.e. the
-microphone and camera on a laptop. AV-ASR is the task of transcribing
-text from audio and visual streams, which has recently attracted a lot
-of research attention due to its robustness against noise.
-
-.. note::
-
-    This tutorial requires the ffmpeg, sentencepiece, mediapipe,
-    opencv-python and scikit-image libraries.
-
- There are multiple ways to install ffmpeg libraries.
- If you are using Anaconda Python
- distribution, ``conda install -c conda-forge 'ffmpeg<7'`` will
- install compatible FFmpeg libraries.
-
- You can run
- ``pip install sentencepiece mediapipe opencv-python scikit-image`` to
- install the other libraries mentioned.
-
-.. note::
-
- We do not have any pre-trained models available at this time. The
-    following recipe uses placeholders for the sentencepiece model path
- ``spm_model_path`` and the pretrained model path ``avsr_model_path``.
-
-    If you are interested in the training recipe for real-time AV-ASR
-    models, please refer to the `real-time
-    AV-ASR `__
-    recipe.
-
-.. note::
-
- To run this tutorial, please make sure you are in the `tutorial` folder.
-
-"""
-
-import numpy as np
-import sentencepiece as spm
-import torch
-import torchaudio
-import torchvision
-
-######################################################################
-# Overview
-# --------
-#
-# The real-time AV-ASR system is presented as follows. It consists of
-# three components: a data collection module, a pre-processing module,
-# and an end-to-end model. The data collection module is hardware, such
-# as a microphone and camera. Its role is to collect information from
-# the real world. Once the information is collected, the pre-processing
-# module locates and crops out the face. Next, we feed the raw audio
-# stream and the pre-processed video stream into our end-to-end model
-# for inference.
-#
-# .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/overview.png
-#
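-#
-# Conceptually, the main loop built in the remainder of this tutorial
-# boils down to the following sketch, where ``acquire``, ``preprocess``
-# and ``model`` stand in for the components defined in the next sections:
-#
-# .. code::
-#
-#    while True:
-#        audio, video = acquire()                  # 1. data collection
-#        audio, video = preprocess(audio, video)   # 2. pre-processing
-#        print(model(audio, video))                # 3. end-to-end inference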
-
-
-######################################################################
-# 1. Data acquisition
-# -------------------
-#
-# First, we define a function that collects video and audio from the
-# microphone and camera. Specifically, we use the
-# :py:class:`~torchaudio.io.StreamReader` class for data collection,
-# which supports capturing audio/video from the microphone and camera.
-# For the detailed usage of this class, please refer to the
-# `StreamReader tutorial <./streamreader_basic_tutorial.html>`__.
-#
-
-
-def stream(q, format, option, src, segment_length, sample_rate):
- print("Building StreamReader...")
- streamer = torchaudio.io.StreamReader(src=src, format=format, option=option)
- streamer.add_basic_video_stream(frames_per_chunk=segment_length, buffer_chunk_size=500, width=600, height=340)
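-    # At 19200 Hz audio and 30 fps video (as configured in ``main`` below),
-    # one video frame corresponds to 640 audio samples, hence the factor of 640.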
- streamer.add_basic_audio_stream(frames_per_chunk=segment_length * 640, sample_rate=sample_rate)
-
- print(streamer.get_src_stream_info(0))
- print(streamer.get_src_stream_info(1))
- print("Streaming...")
- print()
- for (chunk_v, chunk_a) in streamer.stream(timeout=-1, backoff=1.0):
- q.put([chunk_v, chunk_a])
-
-
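-######################################################################
-# ``ContextCacher`` caches the trailing frames of the previous chunk and
-# prepends them to the current one, so that each segment passed to the
-# model is preceded by a fixed amount of past context. ``rate_ratio`` is
-# the number of audio samples per video frame and is used to scale the
-# audio context length accordingly. Chunks shorter than
-# ``segment_length`` are zero-padded.
-#
-
-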
-class ContextCacher:
- def __init__(self, segment_length: int, context_length: int, rate_ratio: int):
- self.segment_length = segment_length
- self.context_length = context_length
-
- self.context_length_v = context_length
- self.context_length_a = context_length * rate_ratio
- self.context_v = torch.zeros([self.context_length_v, 3, 340, 600])
- self.context_a = torch.zeros([self.context_length_a, 1])
-
- def __call__(self, chunk_v, chunk_a):
- if chunk_v.size(0) < self.segment_length:
- chunk_v = torch.nn.functional.pad(chunk_v, (0, 0, 0, 0, 0, 0, 0, self.segment_length - chunk_v.size(0)))
- if chunk_a.size(0) < self.segment_length * 640:
- chunk_a = torch.nn.functional.pad(chunk_a, (0, 0, 0, self.segment_length * 640 - chunk_a.size(0)))
-
- if self.context_length == 0:
- return chunk_v.float(), chunk_a.float()
- else:
- chunk_with_context_v = torch.cat((self.context_v, chunk_v))
- chunk_with_context_a = torch.cat((self.context_a, chunk_a))
- self.context_v = chunk_v[-self.context_length_v :]
- self.context_a = chunk_a[-self.context_length_a :]
- return chunk_with_context_v.float(), chunk_with_context_a.float()
-
-
-######################################################################
-# 2. Pre-processing
-# -----------------
-#
-# Before feeding the raw stream into our model, each video sequence has to
-# undergo a specific pre-processing procedure. This involves three
-# critical steps. The first step is to perform face detection. Following
-# that, each individual frame is aligned to a reference frame, commonly
-# known as the mean face, in order to normalize rotation and size
-# differences across frames. The final step in the pre-processing module
-# is to crop the face region from the aligned face image.
-#
-# .. list-table::
-# :widths: 25 25 25 25
-# :header-rows: 0
-#
-# * - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/original.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/detected.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/transformed.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/cropped.gif
-#
-# * - 0. Original
-# - 1. Detected
-# - 2. Transformed
-# - 3. Cropped
-#
-
-import sys
-
-sys.path.insert(0, "../../examples")
-
-from avsr.data_prep.detectors.mediapipe.detector import LandmarksDetector
-from avsr.data_prep.detectors.mediapipe.video_process import VideoProcess
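-
-# ``LandmarksDetector`` performs the face/landmark detection step (via
-# mediapipe), and ``VideoProcess`` implements the alignment to the mean
-# face and the cropping of the face region described above.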
-
-
-class FunctionalModule(torch.nn.Module):
- def __init__(self, functional):
- super().__init__()
- self.functional = functional
-
- def forward(self, input):
- return self.functional(input)
-
-
-class Preprocessing(torch.nn.Module):
- def __init__(self):
- super().__init__()
- self.landmarks_detector = LandmarksDetector()
- self.video_process = VideoProcess()
-        self.video_transform = torch.nn.Sequential(
-            # resize each frame so that its shorter side is 44 pixels
-            FunctionalModule(
-                lambda n: [(lambda x: torchvision.transforms.functional.resize(x, 44, antialias=True))(i) for i in n]
-            ),
-            # stack the list of frames back into a single tensor
-            FunctionalModule(lambda x: torch.stack(x)),
-            # scale pixel values from [0, 255] to [0, 1]
-            torchvision.transforms.Normalize(0.0, 255.0),
-            torchvision.transforms.CenterCrop(44),
-            torchvision.transforms.Grayscale(),
-            # normalize with fixed mean/std (presumably the training-set statistics)
-            torchvision.transforms.Normalize(0.421, 0.165),
-        )
-
- def forward(self, audio, video):
-        # convert to THWC uint8 frames for the landmark detector
-        video = video.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
-        landmarks = self.landmarks_detector(video)
-        # align each frame to the mean face and crop the face region
-        video = self.video_process(video, landmarks)
-        video = torch.tensor(video).permute(0, 3, 1, 2).float()
-        video = self.video_transform(video)
-        # downmix the audio to a single channel
-        audio = audio.mean(axis=-1, keepdim=True)
- return audio, video
-
-
-######################################################################
-# 3. Building inference pipeline
-# ------------------------------
-#
-# The next step is to create the components required for the pipeline.
-#
-# We use convolution-based front-ends to extract features from both the
-# raw audio and video streams. These features are then passed through a
-# two-layer MLP for fusion. For our transducer model, we leverage the
-# TorchAudio library, which incorporates an encoder (Emformer), a
-# predictor, and a joint network. The architecture of the proposed AV-ASR
-# model is illustrated as follows.
-#
-# .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/architecture.png
-#
-
-from avsr.models.fusion import fusion_module
-from avsr.models.resnet import video_resnet
-from avsr.models.resnet1d import audio_resnet
-
-
-class AVSR(torch.nn.Module):
- def __init__(
- self,
- audio_frontend,
- video_frontend,
- fusion,
- model,
- ):
- super().__init__()
- self.audio_frontend = audio_frontend
- self.video_frontend = video_frontend
- self.fusion = fusion
- self.model = model
-
- def forward(self, audio, video):
- audio_features = self.audio_frontend(audio)
- video_features = self.video_frontend(video)
- return self.fusion(torch.cat([video_features, audio_features], dim=-1))
-
-
-class SentencePieceTokenProcessor:
- def __init__(self, sp_model):
- self.sp_model = sp_model
- self.post_process_remove_list = {
- self.sp_model.unk_id(),
- self.sp_model.eos_id(),
- self.sp_model.pad_id(),
- }
-
- def __call__(self, tokens, lstrip: bool = True) -> str:
- filtered_hypo_tokens = [
- token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list
- ]
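-        # "\u2581" is the word-boundary marker used by SentencePiece; replace it with a space.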
- output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ")
-
- if lstrip:
- return output_string.lstrip()
- else:
- return output_string
-
-
-class InferencePipeline(torch.nn.Module):
- def __init__(self, preprocessor, model, decoder, token_processor):
- super().__init__()
- self.preprocessor = preprocessor
- self.model = model
- self.decoder = decoder
- self.token_processor = token_processor
-
- self.state = None
- self.hypothesis = None
-
- def forward(self, audio, video):
- audio, video = self.preprocessor(audio, video)
- feats = self.model(audio.unsqueeze(0), video.unsqueeze(0))
- length = torch.tensor([feats.size(1)], device=audio.device)
- hypos, self.state = self.decoder.infer(feats, length, 10, state=self.state, hypothesis=self.hypothesis)
- self.hypothesis = hypos[0]
- transcript = self.token_processor(self.hypothesis[0], lstrip=False)
- return transcript
-
-
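-######################################################################
-# Note that ``InferencePipeline`` stores the beam-search ``state`` and
-# the current ``hypothesis`` as attributes, so that they can either be
-# carried over between segments or reset, as the main loop below does
-# before each buffered segment.
-#
-
-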
-def _get_inference_pipeline(avsr_model_config, avsr_model_path, spm_model_path):
- model = AVSR(
- audio_frontend=audio_resnet(),
- video_frontend=video_resnet(),
- fusion=fusion_module(
- 1024,
- avsr_model_config["transformer_ffn_dim"],
- avsr_model_config["input_dim"],
- avsr_model_config["transformer_dropout"],
- ),
- model=torchaudio.models.emformer_rnnt_model(**avsr_model_config),
- )
- ckpt = torch.load(avsr_model_path, map_location=lambda storage, loc: storage)["state_dict"]
- model.load_state_dict(ckpt)
- model.eval()
-
- sp_model = spm.SentencePieceProcessor(model_file=spm_model_path)
- token_processor = SentencePieceTokenProcessor(sp_model)
-
- decoder = torchaudio.models.RNNTBeamSearch(model.model, sp_model.get_piece_size())
-
- return InferencePipeline(
- preprocessor=Preprocessing(),
- model=model,
- decoder=decoder,
- token_processor=token_processor,
- )
-
-
-######################################################################
-# 4. The main process
-# -------------------
-#
-# The execution flow of the main process is as follows:
-#
-# 1. Initialize the inference pipeline.
-# 2. Launch data acquisition subprocess.
-# 3. Run inference.
-# 4. Clean up.
-#
-
-
-def main(device, src, option=None):
- print("Building pipeline...")
- spm_model_path = "../avsr/spm_unigram_1023.model"
- avsr_model_path = "../avsr/online_avsr_model.pth"
- avsr_model_config = {
- "input_dim": 512,
- "encoding_dim": 1024,
- "segment_length": 32,
- "right_context_length": 4,
- "time_reduction_input_dim": 768,
- "time_reduction_stride": 1,
- "transformer_num_heads": 12,
- "transformer_ffn_dim": 3072,
- "transformer_num_layers": 20,
- "transformer_dropout": 0.1,
- "transformer_activation": "gelu",
- "transformer_left_context_length": 30,
- "transformer_max_memory_size": 0,
- "transformer_weight_init_scale_strategy": "depthwise",
- "transformer_tanh_on_mem": True,
- "symbol_embedding_dim": 512,
- "num_lstm_layers": 3,
- "lstm_layer_norm": True,
- "lstm_layer_norm_epsilon": 0.001,
- "lstm_dropout": 0.3,
- "num_symbols": 1024,
- }
- pipeline = _get_inference_pipeline(avsr_model_config, avsr_model_path, spm_model_path)
-
- BUFFER_SIZE = 32
- segment_length = 8
- context_length = 4
- sample_rate = 19200
- frame_rate = 30
- rate_ratio = sample_rate // frame_rate
- cacher = ContextCacher(BUFFER_SIZE, context_length, rate_ratio)
-
- import torch.multiprocessing as mp
-
- ctx = mp.get_context("spawn")
-
- @torch.inference_mode()
- def infer():
- num_video_frames = 0
- video_chunks = []
- audio_chunks = []
- while True:
- chunk_v, chunk_a = q.get()
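-            # One video frame corresponds to 640 audio samples, so derive the number
-            # of buffered video frames from the length of the audio chunk.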
- num_video_frames += chunk_a.size(0) // 640
- video_chunks.append(chunk_v)
- audio_chunks.append(chunk_a)
- if num_video_frames < BUFFER_SIZE:
- continue
- video = torch.cat(video_chunks)
- audio = torch.cat(audio_chunks)
- video, audio = cacher(video, audio)
- pipeline.state, pipeline.hypothesis = None, None
- transcript = pipeline(audio, video.float())
- print(transcript, end="", flush=True)
- num_video_frames = 0
- video_chunks = []
- audio_chunks = []
-
- q = ctx.Queue()
- p = ctx.Process(target=stream, args=(q, device, option, src, segment_length, sample_rate))
- p.start()
- infer()
- p.join()
-
-
-if __name__ == "__main__":
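-    # "avfoundation" is FFmpeg's macOS capture device; src="0:1" selects video
-    # device 0 and audio device 1. Run ``ffmpeg -f avfoundation -list_devices true -i ""``
-    # to list the devices available on your machine.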
- main(
- device="avfoundation",
- src="0:1",
- option={"framerate": "30", "pixel_format": "rgb24"},
- )
-
-######################################################################
-#
-# .. code::
-#
-# Building pipeline...
-# Building StreamReader...
-# SourceVideoStream(media_type='video', codec='rawvideo', codec_long_name='raw video', format='uyvy422', bit_rate=0, num_frames=0, bits_per_sample=0, metadata={}, width=1552, height=1552, frame_rate=1000000.0)
-# SourceAudioStream(media_type='audio', codec='pcm_f32le', codec_long_name='PCM 32-bit floating point little-endian', format='flt', bit_rate=1536000, num_frames=0, bits_per_sample=0, metadata={}, sample_rate=48000.0, num_channels=1)
-# Streaming...
-#
-# hello world
-#
-
-######################################################################
-#
-# Tag: :obj:`torchaudio.io`
-#