diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7001852df4..a49d99fce3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -71,7 +71,6 @@ model implementations and application components.
tutorials/asr_inference_with_cuda_ctc_decoder_tutorial
tutorials/online_asr_tutorial
tutorials/device_asr
- tutorials/device_avsr
tutorials/forced_alignment_tutorial
tutorials/forced_alignment_for_multilingual_data_tutorial
tutorials/tacotron2_pipeline_tutorial
@@ -133,13 +132,6 @@ Tutorials
:link: tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.html
:tags: Pipelines,ASR,CTC-Decoder,CUDA-CTC-Decoder
-.. customcarditem::
- :header: On device audio-visual automatic speech recognition
-   :card_description: Learn how to stream audio and video from a laptop webcam and perform audio-visual automatic speech recognition using the Emformer RNN-T model.
- :image: https://download.pytorch.org/torchaudio/doc-assets/avsr/transformed.gif
- :link: tutorials/device_avsr.html
- :tags: I/O,Pipelines,RNNT
-
.. customcarditem::
:header: Loading waveform Tensors from files and saving them
   :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info, torchaudio.load and torchaudio.save functions.
diff --git a/examples/tutorials/device_avsr.py b/examples/tutorials/device_avsr.py
deleted file mode 100644
index 0bb7a5792d..0000000000
--- a/examples/tutorials/device_avsr.py
+++ /dev/null
@@ -1,408 +0,0 @@
-"""
-Device AV-ASR with Emformer RNN-T
-=================================
-
-**Author**: `Pingchuan Ma `__, `Moto
-Hira `__.
-
-This tutorial shows how to run on-device audio-visual speech recognition
-(AV-ASR, or AVSR) with TorchAudio on streaming device inputs, i.e. the
-microphone and camera on a laptop. AV-ASR is the task of transcribing
-text from audio and visual streams, which has recently attracted a lot
-of research attention due to its robustness against noise.
-
-.. note::
-
-    This tutorial requires the ffmpeg, sentencepiece, mediapipe,
-    opencv-python and scikit-image libraries.
-
- There are multiple ways to install ffmpeg libraries.
- If you are using Anaconda Python
- distribution, ``conda install -c conda-forge 'ffmpeg<7'`` will
- install compatible FFmpeg libraries.
-
- You can run
- ``pip install sentencepiece mediapipe opencv-python scikit-image`` to
- install the other libraries mentioned.
-
-.. note::
-
- We do not have any pre-trained models available at this time. The
-    following recipe uses placeholders for the sentencepiece model path
- ``spm_model_path`` and the pretrained model path ``avsr_model_path``.
-
-    If you are interested in the training recipe for real-time AV-ASR
-    models, please refer to the `real-time
-    AV-ASR `__
-    recipe.
-
-.. note::
-
- To run this tutorial, please make sure you are in the `tutorial` folder.
-
-"""
-
-import numpy as np
-import sentencepiece as spm
-import torch
-import torchaudio
-import torchvision
-
-######################################################################
-# Overview
-# --------
-#
-# The real-time AV-ASR system is presented as follows. It consists of
-# three components: a data collection module, a pre-processing module,
-# and an end-to-end model. The data collection module is hardware, such
-# as a microphone and camera. Its role is to collect information from
-# the real world. Once the information is collected, the pre-processing
-# module locates and crops out the face. Next, we feed the raw audio
-# stream and the pre-processed video stream into our end-to-end model
-# for inference.
-#
-# .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/overview.png
-#
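-#
-# Conceptually, the main loop built in the remainder of this tutorial
-# boils down to the following sketch, where ``acquire``, ``preprocess``
-# and ``model`` stand in for the components defined in the next sections:
-#
-# .. code::
-#
-#    while True:
-#        audio, video = acquire()                  # 1. data collection
-#        audio, video = preprocess(audio, video)   # 2. pre-processing
-#        print(model(audio, video))                # 3. end-to-end inference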
-
-
-######################################################################
-# 1. Data acquisition
-# -------------------
-#
-# First, we define a function that collects video and audio from the
-# microphone and camera. Specifically, we use the
-# :py:class:`~torchaudio.io.StreamReader` class for data collection,
-# which supports capturing audio/video from the microphone and camera.
-# For the detailed usage of this class, please refer to the
-# `StreamReader tutorial <./streamreader_basic_tutorial.html>`__.
-#
-
-
-def stream(q, format, option, src, segment_length, sample_rate):
- print("Building StreamReader...")
- streamer = torchaudio.io.StreamReader(src=src, format=format, option=option)
- streamer.add_basic_video_stream(frames_per_chunk=segment_length, buffer_chunk_size=500, width=600, height=340)
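-    # At 19200 Hz audio and 30 fps video (as configured in ``main`` below),
-    # one video frame corresponds to 640 audio samples, hence the factor of 640.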
- streamer.add_basic_audio_stream(frames_per_chunk=segment_length * 640, sample_rate=sample_rate)
-
- print(streamer.get_src_stream_info(0))
- print(streamer.get_src_stream_info(1))
- print("Streaming...")
- print()
- for (chunk_v, chunk_a) in streamer.stream(timeout=-1, backoff=1.0):
- q.put([chunk_v, chunk_a])
-
-
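-######################################################################
-# ``ContextCacher`` caches the trailing frames of the previous chunk and
-# prepends them to the current one, so that each segment passed to the
-# model is preceded by a fixed amount of past context. ``rate_ratio`` is
-# the number of audio samples per video frame and is used to scale the
-# audio context length accordingly. Chunks shorter than
-# ``segment_length`` are zero-padded.
-#
-
-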
-class ContextCacher:
- def __init__(self, segment_length: int, context_length: int, rate_ratio: int):
- self.segment_length = segment_length
- self.context_length = context_length
-
- self.context_length_v = context_length
- self.context_length_a = context_length * rate_ratio
- self.context_v = torch.zeros([self.context_length_v, 3, 340, 600])
- self.context_a = torch.zeros([self.context_length_a, 1])
-
- def __call__(self, chunk_v, chunk_a):
- if chunk_v.size(0) < self.segment_length:
- chunk_v = torch.nn.functional.pad(chunk_v, (0, 0, 0, 0, 0, 0, 0, self.segment_length - chunk_v.size(0)))
- if chunk_a.size(0) < self.segment_length * 640:
- chunk_a = torch.nn.functional.pad(chunk_a, (0, 0, 0, self.segment_length * 640 - chunk_a.size(0)))
-
- if self.context_length == 0:
- return chunk_v.float(), chunk_a.float()
- else:
- chunk_with_context_v = torch.cat((self.context_v, chunk_v))
- chunk_with_context_a = torch.cat((self.context_a, chunk_a))
- self.context_v = chunk_v[-self.context_length_v :]
- self.context_a = chunk_a[-self.context_length_a :]
- return chunk_with_context_v.float(), chunk_with_context_a.float()
-
-
-######################################################################
-# 2. Pre-processing
-# -----------------
-#
-# Before feeding the raw stream into our model, each video sequence has to
-# undergo a specific pre-processing procedure. This involves three
-# critical steps. The first step is to perform face detection. Following
-# that, each individual frame is aligned to a reference frame, commonly
-# known as the mean face, in order to normalize rotation and size
-# differences across frames. The final step in the pre-processing module
-# is to crop the face region from the aligned face image.
-#
-# .. list-table::
-# :widths: 25 25 25 25
-# :header-rows: 0
-#
-# * - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/original.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/detected.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/transformed.gif
-# - .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/cropped.gif
-#
-# * - 0. Original
-# - 1. Detected
-# - 2. Transformed
-# - 3. Cropped
-#
-
-import sys
-
-sys.path.insert(0, "../../examples")
-
-from avsr.data_prep.detectors.mediapipe.detector import LandmarksDetector
-from avsr.data_prep.detectors.mediapipe.video_process import VideoProcess
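-
-# ``LandmarksDetector`` performs the face/landmark detection step (via
-# mediapipe), and ``VideoProcess`` implements the alignment to the mean
-# face and the cropping of the face region described above.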
-
-
-class FunctionalModule(torch.nn.Module):
- def __init__(self, functional):
- super().__init__()
- self.functional = functional
-
- def forward(self, input):
- return self.functional(input)
-
-
-class Preprocessing(torch.nn.Module):
- def __init__(self):
- super().__init__()
- self.landmarks_detector = LandmarksDetector()
- self.video_process = VideoProcess()
-        self.video_transform = torch.nn.Sequential(
-            # resize each frame so that its shorter side is 44 pixels
-            FunctionalModule(
-                lambda n: [(lambda x: torchvision.transforms.functional.resize(x, 44, antialias=True))(i) for i in n]
-            ),
-            # stack the list of frames back into a single tensor
-            FunctionalModule(lambda x: torch.stack(x)),
-            # scale pixel values from [0, 255] to [0, 1]
-            torchvision.transforms.Normalize(0.0, 255.0),
-            torchvision.transforms.CenterCrop(44),
-            torchvision.transforms.Grayscale(),
-            # normalize with fixed mean/std (presumably the training-set statistics)
-            torchvision.transforms.Normalize(0.421, 0.165),
-        )
-
- def forward(self, audio, video):
-        # convert to THWC uint8 frames for the landmark detector
-        video = video.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
-        landmarks = self.landmarks_detector(video)
-        # align each frame to the mean face and crop the face region
-        video = self.video_process(video, landmarks)
-        video = torch.tensor(video).permute(0, 3, 1, 2).float()
-        video = self.video_transform(video)
-        # downmix the audio to a single channel
-        audio = audio.mean(axis=-1, keepdim=True)
- return audio, video
-
-
-######################################################################
-# 3. Building inference pipeline
-# ------------------------------
-#
-# The next step is to create the components required for the pipeline.
-#
-# We use convolution-based front-ends to extract features from both the
-# raw audio and video streams. These features are then passed through a
-# two-layer MLP for fusion. For our transducer model, we leverage the
-# TorchAudio library, which incorporates an encoder (Emformer), a
-# predictor, and a joint network. The architecture of the proposed AV-ASR
-# model is illustrated as follows.
-#
-# .. image:: https://download.pytorch.org/torchaudio/doc-assets/avsr/architecture.png
-#
-
-from avsr.models.fusion import fusion_module
-from avsr.models.resnet import video_resnet
-from avsr.models.resnet1d import audio_resnet
-
-
-class AVSR(torch.nn.Module):
- def __init__(
- self,
- audio_frontend,
- video_frontend,
- fusion,
- model,
- ):
- super().__init__()
- self.audio_frontend = audio_frontend
- self.video_frontend = video_frontend
- self.fusion = fusion
- self.model = model
-
- def forward(self, audio, video):
- audio_features = self.audio_frontend(audio)
- video_features = self.video_frontend(video)
- return self.fusion(torch.cat([video_features, audio_features], dim=-1))
-
-
-class SentencePieceTokenProcessor:
- def __init__(self, sp_model):
- self.sp_model = sp_model
- self.post_process_remove_list = {
- self.sp_model.unk_id(),
- self.sp_model.eos_id(),
- self.sp_model.pad_id(),
- }
-
- def __call__(self, tokens, lstrip: bool = True) -> str:
- filtered_hypo_tokens = [
- token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list
- ]
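-        # "\u2581" is the word-boundary marker used by SentencePiece; replace it with a space.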
- output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ")
-
- if lstrip:
- return output_string.lstrip()
- else:
- return output_string
-
-
-class InferencePipeline(torch.nn.Module):
- def __init__(self, preprocessor, model, decoder, token_processor):
- super().__init__()
- self.preprocessor = preprocessor
- self.model = model
- self.decoder = decoder
- self.token_processor = token_processor
-
- self.state = None
- self.hypothesis = None
-
- def forward(self, audio, video):
- audio, video = self.preprocessor(audio, video)
- feats = self.model(audio.unsqueeze(0), video.unsqueeze(0))
- length = torch.tensor([feats.size(1)], device=audio.device)
- hypos, self.state = self.decoder.infer(feats, length, 10, state=self.state, hypothesis=self.hypothesis)
- self.hypothesis = hypos[0]
- transcript = self.token_processor(self.hypothesis[0], lstrip=False)
- return transcript
-
-
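-######################################################################
-# Note that ``InferencePipeline`` stores the beam-search ``state`` and
-# the current ``hypothesis`` as attributes, so that they can either be
-# carried over between segments or reset, as the main loop below does
-# before each buffered segment.
-#
-
-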
-def _get_inference_pipeline(avsr_model_config, avsr_model_path, spm_model_path):
- model = AVSR(
- audio_frontend=audio_resnet(),
- video_frontend=video_resnet(),
- fusion=fusion_module(
- 1024,
- avsr_model_config["transformer_ffn_dim"],
- avsr_model_config["input_dim"],
- avsr_model_config["transformer_dropout"],
- ),
- model=torchaudio.models.emformer_rnnt_model(**avsr_model_config),
- )
- ckpt = torch.load(avsr_model_path, map_location=lambda storage, loc: storage)["state_dict"]
- model.load_state_dict(ckpt)
- model.eval()
-
- sp_model = spm.SentencePieceProcessor(model_file=spm_model_path)
- token_processor = SentencePieceTokenProcessor(sp_model)
-
- decoder = torchaudio.models.RNNTBeamSearch(model.model, sp_model.get_piece_size())
-
- return InferencePipeline(
- preprocessor=Preprocessing(),
- model=model,
- decoder=decoder,
- token_processor=token_processor,
- )
-
-
-######################################################################
-# 4. The main process
-# -------------------
-#
-# The execution flow of the main process is as follows:
-#
-# 1. Initialize the inference pipeline.
-# 2. Launch data acquisition subprocess.
-# 3. Run inference.
-# 4. Clean up.
-#
-
-
-def main(device, src, option=None):
- print("Building pipeline...")
- spm_model_path = "../avsr/spm_unigram_1023.model"
- avsr_model_path = "../avsr/online_avsr_model.pth"
- avsr_model_config = {
- "input_dim": 512,
- "encoding_dim": 1024,
- "segment_length": 32,
- "right_context_length": 4,
- "time_reduction_input_dim": 768,
- "time_reduction_stride": 1,
- "transformer_num_heads": 12,
- "transformer_ffn_dim": 3072,
- "transformer_num_layers": 20,
- "transformer_dropout": 0.1,
- "transformer_activation": "gelu",
- "transformer_left_context_length": 30,
- "transformer_max_memory_size": 0,
- "transformer_weight_init_scale_strategy": "depthwise",
- "transformer_tanh_on_mem": True,
- "symbol_embedding_dim": 512,
- "num_lstm_layers": 3,
- "lstm_layer_norm": True,
- "lstm_layer_norm_epsilon": 0.001,
- "lstm_dropout": 0.3,
- "num_symbols": 1024,
- }
- pipeline = _get_inference_pipeline(avsr_model_config, avsr_model_path, spm_model_path)
-
- BUFFER_SIZE = 32
- segment_length = 8
- context_length = 4
- sample_rate = 19200
- frame_rate = 30
- rate_ratio = sample_rate // frame_rate
- cacher = ContextCacher(BUFFER_SIZE, context_length, rate_ratio)
-
- import torch.multiprocessing as mp
-
- ctx = mp.get_context("spawn")
-
- @torch.inference_mode()
- def infer():
- num_video_frames = 0
- video_chunks = []
- audio_chunks = []
- while True:
- chunk_v, chunk_a = q.get()
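-            # One video frame corresponds to 640 audio samples, so derive the number
-            # of buffered video frames from the length of the audio chunk.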
- num_video_frames += chunk_a.size(0) // 640
- video_chunks.append(chunk_v)
- audio_chunks.append(chunk_a)
- if num_video_frames < BUFFER_SIZE:
- continue
- video = torch.cat(video_chunks)
- audio = torch.cat(audio_chunks)
- video, audio = cacher(video, audio)
- pipeline.state, pipeline.hypothesis = None, None
- transcript = pipeline(audio, video.float())
- print(transcript, end="", flush=True)
- num_video_frames = 0
- video_chunks = []
- audio_chunks = []
-
- q = ctx.Queue()
- p = ctx.Process(target=stream, args=(q, device, option, src, segment_length, sample_rate))
- p.start()
- infer()
- p.join()
-
-
-if __name__ == "__main__":
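-    # "avfoundation" is FFmpeg's macOS capture device; src="0:1" selects video
-    # device 0 and audio device 1. Run ``ffmpeg -f avfoundation -list_devices true -i ""``
-    # to list the devices available on your machine.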
- main(
- device="avfoundation",
- src="0:1",
- option={"framerate": "30", "pixel_format": "rgb24"},
- )
-
-######################################################################
-#
-# .. code::
-#
-# Building pipeline...
-# Building StreamReader...
-# SourceVideoStream(media_type='video', codec='rawvideo', codec_long_name='raw video', format='uyvy422', bit_rate=0, num_frames=0, bits_per_sample=0, metadata={}, width=1552, height=1552, frame_rate=1000000.0)
-# SourceAudioStream(media_type='audio', codec='pcm_f32le', codec_long_name='PCM 32-bit floating point little-endian', format='flt', bit_rate=1536000, num_frames=0, bits_per_sample=0, metadata={}, sample_rate=48000.0, num_channels=1)
-# Streaming...
-#
-# hello world
-#
-
-######################################################################
-#
-# Tag: :obj:`torchaudio.io`
-#