From a564d10afe1a78c31934f0492422700f61a0ffc0 Mon Sep 17 00:00:00 2001
From: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Date: Tue, 28 May 2024 18:07:07 +0100
Subject: [PATCH] Deprecate low use models (#30781)
* Deprecate models
- graphormer
- time_series_transformer
- xlm_prophetnet
- qdqbert
- nat
- ernie_m
- tvlt
- nezha
- mega
- jukebox
- vit_hybrid
- x_clip
- deta
- speech_to_text_2
- efficientformer
- realm
- gptsan_japanese
* Fix up
* Fix speech2text2 imports
* Make sure message isn't indented
* Fix docstrings
* Correctly map deprecated models from model_type
* Uncomment
* Add back time series transformer and x-clip
* Import fix and fix-up
* Fix up with updated ruff
---
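The file moves below relocate each model's code to `src/transformers/models/deprecated/<model>/`, while the updated `_import_structure` entries in `src/transformers/__init__.py` re-register the same public symbols, so existing top-level imports should keep resolving. A minimal sketch of what this means for a user, using Graphormer as an example (the class names come from the import map in this patch; that they remain importable is an expectation based on those entries, not something verified here):

```python
# Hedged sketch: these top-level imports are expected to keep working after the
# move, because "models.deprecated.graphormer" still registers the same names
# in the lazy _import_structure of transformers/__init__.py.
from transformers import GraphormerConfig, GraphormerForGraphClassification

config = GraphormerConfig()                       # deprecated, maintenance mode only
model = GraphormerForGraphClassification(config)  # randomly initialized instance

# If the deprecated code path breaks, the updated docs recommend pinning the
# last fully supported release instead:
#     pip install -U transformers==4.40.2
```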
docs/source/en/model_doc/deta.md | 8 +
docs/source/en/model_doc/efficientformer.md | 38 +-
docs/source/en/model_doc/ernie_m.md | 8 +
docs/source/en/model_doc/gptsan-japanese.md | 8 +
docs/source/en/model_doc/graphormer.md | 12 +-
docs/source/en/model_doc/jukebox.md | 10 +-
docs/source/en/model_doc/mega.md | 18 +-
docs/source/en/model_doc/nat.md | 8 +
docs/source/en/model_doc/nezha.md | 14 +-
docs/source/en/model_doc/qdqbert.md | 8 +
docs/source/en/model_doc/realm.md | 10 +-
docs/source/en/model_doc/speech_to_text_2.md | 8 +
docs/source/en/model_doc/tvlt.md | 10 +-
docs/source/en/model_doc/vit_hybrid.md | 8 +
docs/source/en/model_doc/xclip.md | 2 +-
docs/source/en/model_doc/xlm-prophetnet.md | 8 +
src/transformers/__init__.py | 692 ++++-----
src/transformers/models/__init__.py | 15 -
.../models/auto/configuration_auto.py | 21 +-
.../models/{ => deprecated}/deta/__init__.py | 2 +-
.../deta/configuration_deta.py | 6 +-
.../deta/convert_deta_resnet_to_pytorch.py | 0
.../deta/convert_deta_swin_to_pytorch.py | 0
.../deta/image_processing_deta.py | 12 +-
.../{ => deprecated}/deta/modeling_deta.py | 16 +-
.../efficientformer/__init__.py | 2 +-
.../configuration_efficientformer.py | 4 +-
..._original_pytorch_checkpoint_to_pytorch.py | 0
.../image_processing_efficientformer.py | 8 +-
.../modeling_efficientformer.py | 8 +-
.../modeling_tf_efficientformer.py | 10 +-
.../{ => deprecated}/ernie_m/__init__.py | 2 +-
.../ernie_m/configuration_ernie_m.py | 2 +-
.../ernie_m/modeling_ernie_m.py | 10 +-
.../ernie_m/tokenization_ernie_m.py | 4 +-
.../gptsan_japanese/__init__.py | 2 +-
.../configuration_gptsan_japanese.py | 4 +-
...convert_gptsan_tf_checkpoint_to_pytorch.py | 0
.../modeling_gptsan_japanese.py | 8 +-
.../tokenization_gptsan_japanese.py | 6 +-
.../{ => deprecated}/graphormer/__init__.py | 2 +-
.../graphormer/algos_graphormer.pyx | 0
.../graphormer/collating_graphormer.py | 2 +-
.../graphormer/configuration_graphormer.py | 4 +-
.../graphormer/modeling_graphormer.py | 8 +-
.../{ => deprecated}/jukebox/__init__.py | 2 +-
.../jukebox/configuration_jukebox.py | 4 +-
.../jukebox/convert_jukebox.py | 0
.../jukebox/modeling_jukebox.py | 8 +-
.../jukebox/tokenization_jukebox.py | 8 +-
.../models/{ => deprecated}/mega/__init__.py | 2 +-
.../mega/configuration_mega.py | 6 +-
..._original_pytorch_checkpoint_to_pytorch.py | 0
.../{ => deprecated}/mega/modeling_mega.py | 10 +-
.../models/{ => deprecated}/nat/__init__.py | 2 +-
.../{ => deprecated}/nat/configuration_nat.py | 6 +-
.../{ => deprecated}/nat/modeling_nat.py | 12 +-
.../models/{ => deprecated}/nezha/__init__.py | 2 +-
.../nezha/configuration_nezha.py | 2 +-
.../{ => deprecated}/nezha/modeling_nezha.py | 10 +-
.../{ => deprecated}/qdqbert/__init__.py | 2 +-
.../qdqbert/configuration_qdqbert.py | 4 +-
.../qdqbert/modeling_qdqbert.py | 10 +-
.../models/{ => deprecated}/realm/__init__.py | 2 +-
.../realm/configuration_realm.py | 4 +-
.../{ => deprecated}/realm/modeling_realm.py | 10 +-
.../{ => deprecated}/realm/retrieval_realm.py | 4 +-
.../realm/tokenization_realm.py | 6 +-
.../realm/tokenization_realm_fast.py | 6 +-
.../speech_to_text_2/__init__.py | 2 +-
.../configuration_speech_to_text_2.py | 4 +-
.../modeling_speech_to_text_2.py | 10 +-
.../processing_speech_to_text_2.py | 2 +-
.../tokenization_speech_to_text_2.py | 4 +-
.../models/{ => deprecated}/tvlt/__init__.py | 2 +-
.../tvlt/configuration_tvlt.py | 4 +-
.../tvlt/feature_extraction_tvlt.py | 6 +-
.../tvlt/image_processing_tvlt.py | 8 +-
.../{ => deprecated}/tvlt/modeling_tvlt.py | 10 +-
.../{ => deprecated}/tvlt/processing_tvlt.py | 2 +-
.../{ => deprecated}/vit_hybrid/__init__.py | 2 +-
.../vit_hybrid/configuration_vit_hybrid.py | 8 +-
.../convert_vit_hybrid_timm_to_pytorch.py | 0
.../vit_hybrid/image_processing_vit_hybrid.py | 8 +-
.../vit_hybrid/modeling_vit_hybrid.py | 12 +-
.../xlm_prophetnet/__init__.py | 2 +-
.../configuration_xlm_prophetnet.py | 4 +-
.../xlm_prophetnet/modeling_xlm_prophetnet.py | 8 +-
.../tokenization_xlm_prophetnet.py | 4 +-
.../models/dinat/modeling_dinat.py | 15 -
src/transformers/utils/dummy_pt_objects.py | 1112 +++++++-------
.../utils/dummy_sentencepiece_objects.py | 14 +-
src/transformers/utils/dummy_tf_objects.py | 48 +-
.../utils/dummy_tokenizers_objects.py | 14 +-
.../utils/dummy_vision_objects.py | 42 +-
tests/models/deta/__init__.py | 0
.../models/deta/test_image_processing_deta.py | 535 -------
tests/models/deta/test_modeling_deta.py | 671 ---------
tests/models/efficientformer/__init__.py | 0
.../test_image_processing_efficientformer.py | 99 --
.../test_modeling_efficientformer.py | 478 ------
.../test_modeling_tf_efficientformer.py | 409 ------
tests/models/ernie_m/__init__.py | 0
tests/models/ernie_m/test_modeling_ernie_m.py | 323 ----
.../ernie_m/test_tokenization_ernie_m.py | 143 --
tests/models/gptsan_japanese/__init__.py | 0
.../test_modeling_gptsan_japanese.py | 476 ------
.../test_tokenization_gptsan_japanese.py | 218 ---
tests/models/graphormer/__init__.py | 0
.../graphormer/test_modeling_graphormer.py | 1300 -----------------
tests/models/jukebox/__init__.py | 0
tests/models/jukebox/test_modeling_jukebox.py | 407 ------
.../jukebox/test_tokenization_jukebox.py | 209 ---
tests/models/mega/__init__.py | 0
tests/models/mega/test_modeling_mega.py | 744 ----------
tests/models/nat/__init__.py | 0
tests/models/nat/test_modeling_nat.py | 382 -----
tests/models/nezha/__init__.py | 0
tests/models/nezha/test_modeling_nezha.py | 489 -------
tests/models/qdqbert/__init__.py | 0
tests/models/qdqbert/test_modeling_qdqbert.py | 573 --------
tests/models/realm/__init__.py | 0
tests/models/realm/test_modeling_realm.py | 554 -------
tests/models/realm/test_retrieval_realm.py | 187 ---
tests/models/realm/test_tokenization_realm.py | 322 ----
.../test_modeling_speech_encoder_decoder.py | 42 -
tests/models/speech_to_text_2/__init__.py | 0
.../test_modeling_speech_to_text_2.py | 216 ---
.../test_tokenization_speech_to_text_2.py | 98 --
tests/models/tvlt/__init__.py | 0
.../tvlt/test_feature_extraction_tvlt.py | 182 ---
.../models/tvlt/test_image_processor_tvlt.py | 294 ----
tests/models/tvlt/test_modeling_tvlt.py | 625 --------
tests/models/tvlt/test_processor_tvlt.py | 116 --
tests/models/vit_hybrid/__init__.py | 0
.../vit_hybrid/test_modeling_vit_hybrid.py | 281 ----
tests/models/xlm_prophetnet/__init__.py | 0
.../test_modeling_xlm_prophetnet.py | 150 --
.../test_tokenization_xlm_prophetnet.py | 154 --
utils/check_config_attributes.py | 6 -
utils/deprecate_models.py | 17 +-
utils/not_doctested.txt | 29 -
142 files changed, 1313 insertions(+), 11913 deletions(-)
rename src/transformers/models/{ => deprecated}/deta/__init__.py (94%)
rename src/transformers/models/{ => deprecated}/deta/configuration_deta.py (99%)
rename src/transformers/models/{ => deprecated}/deta/convert_deta_resnet_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/deta/convert_deta_swin_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/deta/image_processing_deta.py (99%)
rename src/transformers/models/{ => deprecated}/deta/modeling_deta.py (99%)
rename src/transformers/models/{ => deprecated}/efficientformer/__init__.py (99%)
rename src/transformers/models/{ => deprecated}/efficientformer/configuration_efficientformer.py (98%)
rename src/transformers/models/{ => deprecated}/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/efficientformer/image_processing_efficientformer.py (98%)
rename src/transformers/models/{ => deprecated}/efficientformer/modeling_efficientformer.py (99%)
rename src/transformers/models/{ => deprecated}/efficientformer/modeling_tf_efficientformer.py (99%)
rename src/transformers/models/{ => deprecated}/ernie_m/__init__.py (95%)
rename src/transformers/models/{ => deprecated}/ernie_m/configuration_ernie_m.py (99%)
rename src/transformers/models/{ => deprecated}/ernie_m/modeling_ernie_m.py (99%)
rename src/transformers/models/{ => deprecated}/ernie_m/tokenization_ernie_m.py (99%)
rename src/transformers/models/{ => deprecated}/gptsan_japanese/__init__.py (98%)
rename src/transformers/models/{ => deprecated}/gptsan_japanese/configuration_gptsan_japanese.py (98%)
rename src/transformers/models/{ => deprecated}/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/gptsan_japanese/modeling_gptsan_japanese.py (99%)
rename src/transformers/models/{ => deprecated}/gptsan_japanese/tokenization_gptsan_japanese.py (99%)
rename src/transformers/models/{ => deprecated}/graphormer/__init__.py (93%)
rename src/transformers/models/{ => deprecated}/graphormer/algos_graphormer.pyx (100%)
rename src/transformers/models/{ => deprecated}/graphormer/collating_graphormer.py (98%)
rename src/transformers/models/{ => deprecated}/graphormer/configuration_graphormer.py (99%)
rename src/transformers/models/{ => deprecated}/graphormer/modeling_graphormer.py (99%)
rename src/transformers/models/{ => deprecated}/jukebox/__init__.py (95%)
rename src/transformers/models/{ => deprecated}/jukebox/configuration_jukebox.py (99%)
rename src/transformers/models/{ => deprecated}/jukebox/convert_jukebox.py (100%)
rename src/transformers/models/{ => deprecated}/jukebox/modeling_jukebox.py (99%)
rename src/transformers/models/{ => deprecated}/jukebox/tokenization_jukebox.py (98%)
rename src/transformers/models/{ => deprecated}/mega/__init__.py (98%)
rename src/transformers/models/{ => deprecated}/mega/configuration_mega.py (99%)
rename src/transformers/models/{ => deprecated}/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/mega/modeling_mega.py (99%)
rename src/transformers/models/{ => deprecated}/nat/__init__.py (94%)
rename src/transformers/models/{ => deprecated}/nat/configuration_nat.py (97%)
rename src/transformers/models/{ => deprecated}/nat/modeling_nat.py (99%)
rename src/transformers/models/{ => deprecated}/nezha/__init__.py (94%)
rename src/transformers/models/{ => deprecated}/nezha/configuration_nezha.py (99%)
rename src/transformers/models/{ => deprecated}/nezha/modeling_nezha.py (99%)
rename src/transformers/models/{ => deprecated}/qdqbert/__init__.py (96%)
rename src/transformers/models/{ => deprecated}/qdqbert/configuration_qdqbert.py (98%)
rename src/transformers/models/{ => deprecated}/qdqbert/modeling_qdqbert.py (99%)
rename src/transformers/models/{ => deprecated}/realm/__init__.py (95%)
rename src/transformers/models/{ => deprecated}/realm/configuration_realm.py (98%)
rename src/transformers/models/{ => deprecated}/realm/modeling_realm.py (99%)
rename src/transformers/models/{ => deprecated}/realm/retrieval_realm.py (99%)
rename src/transformers/models/{ => deprecated}/realm/tokenization_realm.py (99%)
rename src/transformers/models/{ => deprecated}/realm/tokenization_realm_fast.py (98%)
rename src/transformers/models/{ => deprecated}/speech_to_text_2/__init__.py (98%)
rename src/transformers/models/{ => deprecated}/speech_to_text_2/configuration_speech_to_text_2.py (98%)
rename src/transformers/models/{ => deprecated}/speech_to_text_2/modeling_speech_to_text_2.py (99%)
rename src/transformers/models/{ => deprecated}/speech_to_text_2/processing_speech_to_text_2.py (98%)
rename src/transformers/models/{ => deprecated}/speech_to_text_2/tokenization_speech_to_text_2.py (98%)
rename src/transformers/models/{ => deprecated}/tvlt/__init__.py (99%)
rename src/transformers/models/{ => deprecated}/tvlt/configuration_tvlt.py (99%)
rename src/transformers/models/{ => deprecated}/tvlt/feature_extraction_tvlt.py (98%)
rename src/transformers/models/{ => deprecated}/tvlt/image_processing_tvlt.py (99%)
rename src/transformers/models/{ => deprecated}/tvlt/modeling_tvlt.py (99%)
rename src/transformers/models/{ => deprecated}/tvlt/processing_tvlt.py (98%)
rename src/transformers/models/{ => deprecated}/vit_hybrid/__init__.py (94%)
rename src/transformers/models/{ => deprecated}/vit_hybrid/configuration_vit_hybrid.py (97%)
rename src/transformers/models/{ => deprecated}/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py (100%)
rename src/transformers/models/{ => deprecated}/vit_hybrid/image_processing_vit_hybrid.py (98%)
rename src/transformers/models/{ => deprecated}/vit_hybrid/modeling_vit_hybrid.py (98%)
rename src/transformers/models/{ => deprecated}/xlm_prophetnet/__init__.py (95%)
rename src/transformers/models/{ => deprecated}/xlm_prophetnet/configuration_xlm_prophetnet.py (99%)
rename src/transformers/models/{ => deprecated}/xlm_prophetnet/modeling_xlm_prophetnet.py (99%)
rename src/transformers/models/{ => deprecated}/xlm_prophetnet/tokenization_xlm_prophetnet.py (99%)
delete mode 100644 tests/models/deta/__init__.py
delete mode 100644 tests/models/deta/test_image_processing_deta.py
delete mode 100644 tests/models/deta/test_modeling_deta.py
delete mode 100644 tests/models/efficientformer/__init__.py
delete mode 100644 tests/models/efficientformer/test_image_processing_efficientformer.py
delete mode 100644 tests/models/efficientformer/test_modeling_efficientformer.py
delete mode 100644 tests/models/efficientformer/test_modeling_tf_efficientformer.py
delete mode 100644 tests/models/ernie_m/__init__.py
delete mode 100644 tests/models/ernie_m/test_modeling_ernie_m.py
delete mode 100644 tests/models/ernie_m/test_tokenization_ernie_m.py
delete mode 100644 tests/models/gptsan_japanese/__init__.py
delete mode 100644 tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
delete mode 100644 tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
delete mode 100644 tests/models/graphormer/__init__.py
delete mode 100644 tests/models/graphormer/test_modeling_graphormer.py
delete mode 100644 tests/models/jukebox/__init__.py
delete mode 100644 tests/models/jukebox/test_modeling_jukebox.py
delete mode 100644 tests/models/jukebox/test_tokenization_jukebox.py
delete mode 100644 tests/models/mega/__init__.py
delete mode 100644 tests/models/mega/test_modeling_mega.py
delete mode 100644 tests/models/nat/__init__.py
delete mode 100644 tests/models/nat/test_modeling_nat.py
delete mode 100644 tests/models/nezha/__init__.py
delete mode 100644 tests/models/nezha/test_modeling_nezha.py
delete mode 100644 tests/models/qdqbert/__init__.py
delete mode 100644 tests/models/qdqbert/test_modeling_qdqbert.py
delete mode 100644 tests/models/realm/__init__.py
delete mode 100644 tests/models/realm/test_modeling_realm.py
delete mode 100644 tests/models/realm/test_retrieval_realm.py
delete mode 100644 tests/models/realm/test_tokenization_realm.py
delete mode 100644 tests/models/speech_to_text_2/__init__.py
delete mode 100644 tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
delete mode 100644 tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
delete mode 100644 tests/models/tvlt/__init__.py
delete mode 100644 tests/models/tvlt/test_feature_extraction_tvlt.py
delete mode 100644 tests/models/tvlt/test_image_processor_tvlt.py
delete mode 100644 tests/models/tvlt/test_modeling_tvlt.py
delete mode 100644 tests/models/tvlt/test_processor_tvlt.py
delete mode 100644 tests/models/vit_hybrid/__init__.py
delete mode 100644 tests/models/vit_hybrid/test_modeling_vit_hybrid.py
delete mode 100644 tests/models/xlm_prophetnet/__init__.py
delete mode 100644 tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py
delete mode 100644 tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
diff --git a/docs/source/en/model_doc/deta.md b/docs/source/en/model_doc/deta.md
index cdda22af7bbf97..996142bc59d6b5 100644
--- a/docs/source/en/model_doc/deta.md
+++ b/docs/source/en/model_doc/deta.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# DETA
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The DETA model was proposed in [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
diff --git a/docs/source/en/model_doc/efficientformer.md b/docs/source/en/model_doc/efficientformer.md
index 92ba90a9e5ed97..24b20793b03c9b 100644
--- a/docs/source/en/model_doc/efficientformer.md
+++ b/docs/source/en/model_doc/efficientformer.md
@@ -16,28 +16,36 @@ rendered properly in your Markdown viewer.
# EfficientFormer
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
-The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191)
+The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191)
by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a
dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object
detection and semantic segmentation.
The abstract from the paper is the following:
-*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
-However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
-times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
-challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
-complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
-unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
-To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
-Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
-Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
-Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
-Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
-iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model,
-EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
+*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
+However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
+times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
+challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
+complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
+unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
+To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
+Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
+Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
+Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
+Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
+iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model,
+EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
reach extremely low latency on mobile devices while maintaining high performance.*
This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
@@ -93,4 +101,4 @@ The original code can be found [here](https://github.com/snap-research/Efficient
- call
-
\ No newline at end of file
+
diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md
index a99332cb655ac5..85254693501c80 100644
--- a/docs/source/en/model_doc/ernie_m.md
+++ b/docs/source/en/model_doc/ernie_m.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# ErnieM
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning
diff --git a/docs/source/en/model_doc/gptsan-japanese.md b/docs/source/en/model_doc/gptsan-japanese.md
index 1e6b1b6e1cf6d7..108e59048d5d52 100644
--- a/docs/source/en/model_doc/gptsan-japanese.md
+++ b/docs/source/en/model_doc/gptsan-japanese.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# GPTSAN-japanese
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The GPTSAN-japanese model was released in the repository by Toshiyuki Sakamoto (tanreinama).
diff --git a/docs/source/en/model_doc/graphormer.md b/docs/source/en/model_doc/graphormer.md
index 08e3f5fb3e9b5a..d01bf04debf9dd 100644
--- a/docs/source/en/model_doc/graphormer.md
+++ b/docs/source/en/model_doc/graphormer.md
diff --git a/docs/source/en/model_doc/jukebox.md b/docs/source/en/model_doc/jukebox.md
--- a/docs/source/en/model_doc/jukebox.md
+++ b/docs/source/en/model_doc/jukebox.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# Jukebox
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The Jukebox model was proposed in [Jukebox: A generative model for music](https://arxiv.org/pdf/2005.00341.pdf)
@@ -27,7 +35,7 @@ The abstract from the paper is the following:
*We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length.
-First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
+First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
diff --git a/docs/source/en/model_doc/mega.md b/docs/source/en/model_doc/mega.md
index 4ce62ca45a1d74..5545f5e19c47e3 100644
--- a/docs/source/en/model_doc/mega.md
+++ b/docs/source/en/model_doc/mega.md
@@ -16,12 +16,20 @@ rendered properly in your Markdown viewer.
# MEGA
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
-stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA
-while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
+MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
+stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA
+while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
attractive option for long-document NLP tasks.
The abstract from the paper is the following:
@@ -34,8 +42,8 @@ The original code can be found [here](https://github.com/facebookresearch/mega).
## Usage tips
-- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional.
-- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size
+- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional.
+- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size
## Implementation Notes
diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md
index ecb61ccb0a3397..02c2e466cc4a7b 100644
--- a/docs/source/en/model_doc/nat.md
+++ b/docs/source/en/model_doc/nat.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# Neighborhood Attention Transformer
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
NAT was proposed in [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143)
diff --git a/docs/source/en/model_doc/nezha.md b/docs/source/en/model_doc/nezha.md
index 872f576f1286eb..976722592cad22 100644
--- a/docs/source/en/model_doc/nezha.md
+++ b/docs/source/en/model_doc/nezha.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# Nezha
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei et al.
@@ -25,8 +33,8 @@ The abstract from the paper is the following:
*The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
-representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
-The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
+representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
+The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
@@ -85,4 +93,4 @@ This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The ori
## NezhaForQuestionAnswering
[[autodoc]] NezhaForQuestionAnswering
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md
index 19b829d0bc5d19..ca718f34af4a32 100644
--- a/docs/source/en/model_doc/qdqbert.md
+++ b/docs/source/en/model_doc/qdqbert.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# QDQBERT
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md
index a8227bc83c7318..558e83c08b06a6 100644
--- a/docs/source/en/model_doc/realm.md
+++ b/docs/source/en/model_doc/realm.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# REALM
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a
@@ -86,4 +94,4 @@ This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The origi
[[autodoc]] RealmForOpenQA
- block_embedding_to
- - forward
\ No newline at end of file
+ - forward
diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md
index 6648e67f629d3c..fc2d0357c546c7 100644
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# Speech2Text2
+
+
+ This model is in maintenance mode only, we don't accept any new PRs changing its code.
+ If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+ You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md
index f09ea8af863c9a..0a0f50e4731569 100644
--- a/docs/source/en/model_doc/tvlt.md
+++ b/docs/source/en/model_doc/tvlt.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# TVLT
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
@@ -60,7 +68,7 @@ The original code can be found [here](https://github.com/zinengtang/TVLT). This
[[autodoc]] TvltFeatureExtractor
- __call__
-
+
## TvltModel
[[autodoc]] TvltModel
diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md
index ec98fc5e1ef8e0..5cde5e529807e0 100644
--- a/docs/source/en/model_doc/vit_hybrid.md
+++ b/docs/source/en/model_doc/vit_hybrid.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# Hybrid Vision Transformer (ViT Hybrid)
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
## Overview
The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md
index 45c4c3db749be8..8c22747387c084 100644
--- a/docs/source/en/model_doc/xclip.md
+++ b/docs/source/en/model_doc/xclip.md
@@ -30,7 +30,7 @@ Tips:
- Usage of X-CLIP is identical to [CLIP](clip).
+alt="drawing" width="600"/>
X-CLIP architecture. Taken from the original paper.
diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md
index 7a61aeb3e34a0a..b350cb554b0330 100644
--- a/docs/source/en/model_doc/xlm-prophetnet.md
+++ b/docs/source/en/model_doc/xlm-prophetnet.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
# XLM-ProphetNet
+
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+
+
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index fc8f6b1a9c3d54..40b7905bfdbb04 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -321,17 +321,44 @@
"models.deit": ["DeiTConfig"],
"models.deprecated": [],
"models.deprecated.bort": [],
+ "models.deprecated.deta": ["DetaConfig"],
+ "models.deprecated.efficientformer": ["EfficientFormerConfig"],
+ "models.deprecated.ernie_m": ["ErnieMConfig"],
+ "models.deprecated.gptsan_japanese": [
+ "GPTSanJapaneseConfig",
+ "GPTSanJapaneseTokenizer",
+ ],
+ "models.deprecated.graphormer": ["GraphormerConfig"],
+ "models.deprecated.jukebox": [
+ "JukeboxConfig",
+ "JukeboxPriorConfig",
+ "JukeboxTokenizer",
+ "JukeboxVQVAEConfig",
+ ],
"models.deprecated.mctct": [
"MCTCTConfig",
"MCTCTFeatureExtractor",
"MCTCTProcessor",
],
+ "models.deprecated.mega": ["MegaConfig"],
"models.deprecated.mmbt": ["MMBTConfig"],
+ "models.deprecated.nat": ["NatConfig"],
+ "models.deprecated.nezha": ["NezhaConfig"],
"models.deprecated.open_llama": ["OpenLlamaConfig"],
+ "models.deprecated.qdqbert": ["QDQBertConfig"],
+ "models.deprecated.realm": [
+ "RealmConfig",
+ "RealmTokenizer",
+ ],
"models.deprecated.retribert": [
"RetriBertConfig",
"RetriBertTokenizer",
],
+ "models.deprecated.speech_to_text_2": [
+ "Speech2Text2Config",
+ "Speech2Text2Processor",
+ "Speech2Text2Tokenizer",
+ ],
"models.deprecated.tapex": ["TapexTokenizer"],
"models.deprecated.trajectory_transformer": ["TrajectoryTransformerConfig"],
"models.deprecated.transfo_xl": [
@@ -339,9 +366,15 @@
"TransfoXLCorpus",
"TransfoXLTokenizer",
],
+ "models.deprecated.tvlt": [
+ "TvltConfig",
+ "TvltFeatureExtractor",
+ "TvltProcessor",
+ ],
"models.deprecated.van": ["VanConfig"],
+ "models.deprecated.vit_hybrid": ["ViTHybridConfig"],
+ "models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"],
"models.depth_anything": ["DepthAnythingConfig"],
- "models.deta": ["DetaConfig"],
"models.detr": ["DetrConfig"],
"models.dialogpt": [],
"models.dinat": ["DinatConfig"],
@@ -363,7 +396,6 @@
"DPRReaderTokenizer",
],
"models.dpt": ["DPTConfig"],
- "models.efficientformer": ["EfficientFormerConfig"],
"models.efficientnet": ["EfficientNetConfig"],
"models.electra": [
"ElectraConfig",
@@ -375,7 +407,6 @@
],
"models.encoder_decoder": ["EncoderDecoderConfig"],
"models.ernie": ["ErnieConfig"],
- "models.ernie_m": ["ErnieMConfig"],
"models.esm": ["EsmConfig", "EsmTokenizer"],
"models.falcon": ["FalconConfig"],
"models.fastspeech2_conformer": [
@@ -420,11 +451,6 @@
"models.gpt_neox_japanese": ["GPTNeoXJapaneseConfig"],
"models.gpt_sw3": [],
"models.gptj": ["GPTJConfig"],
- "models.gptsan_japanese": [
- "GPTSanJapaneseConfig",
- "GPTSanJapaneseTokenizer",
- ],
- "models.graphormer": ["GraphormerConfig"],
"models.grounding_dino": [
"GroundingDinoConfig",
"GroundingDinoProcessor",
@@ -449,12 +475,6 @@
],
"models.jamba": ["JambaConfig"],
"models.jetmoe": ["JetMoeConfig"],
- "models.jukebox": [
- "JukeboxConfig",
- "JukeboxPriorConfig",
- "JukeboxTokenizer",
- "JukeboxVQVAEConfig",
- ],
"models.kosmos2": [
"Kosmos2Config",
"Kosmos2Processor",
@@ -519,7 +539,6 @@
],
"models.mbart": ["MBartConfig"],
"models.mbart50": [],
- "models.mega": ["MegaConfig"],
"models.megatron_bert": ["MegatronBertConfig"],
"models.megatron_gpt2": [],
"models.mgp_str": [
@@ -554,8 +573,6 @@
"MusicgenMelodyDecoderConfig",
],
"models.mvp": ["MvpConfig", "MvpTokenizer"],
- "models.nat": ["NatConfig"],
- "models.nezha": ["NezhaConfig"],
"models.nllb": [],
"models.nllb_moe": ["NllbMoeConfig"],
"models.nougat": ["NougatProcessor"],
@@ -613,17 +630,12 @@
],
"models.pvt": ["PvtConfig"],
"models.pvt_v2": ["PvtV2Config"],
- "models.qdqbert": ["QDQBertConfig"],
"models.qwen2": [
"Qwen2Config",
"Qwen2Tokenizer",
],
"models.qwen2_moe": ["Qwen2MoeConfig"],
"models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"],
- "models.realm": [
- "RealmConfig",
- "RealmTokenizer",
- ],
"models.recurrent_gemma": ["RecurrentGemmaConfig"],
"models.reformer": ["ReformerConfig"],
"models.regnet": ["RegNetConfig"],
@@ -672,11 +684,6 @@
"Speech2TextFeatureExtractor",
"Speech2TextProcessor",
],
- "models.speech_to_text_2": [
- "Speech2Text2Config",
- "Speech2Text2Processor",
- "Speech2Text2Tokenizer",
- ],
"models.speecht5": [
"SpeechT5Config",
"SpeechT5FeatureExtractor",
@@ -712,11 +719,6 @@
"TrOCRConfig",
"TrOCRProcessor",
],
- "models.tvlt": [
- "TvltConfig",
- "TvltFeatureExtractor",
- "TvltProcessor",
- ],
"models.tvp": [
"TvpConfig",
"TvpProcessor",
@@ -749,7 +751,6 @@
],
"models.visual_bert": ["VisualBertConfig"],
"models.vit": ["ViTConfig"],
- "models.vit_hybrid": ["ViTHybridConfig"],
"models.vit_mae": ["ViTMAEConfig"],
"models.vit_msn": ["ViTMSNConfig"],
"models.vitdet": ["VitDetConfig"],
@@ -788,7 +789,6 @@
],
"models.xglm": ["XGLMConfig"],
"models.xlm": ["XLMConfig", "XLMTokenizer"],
- "models.xlm_prophetnet": ["XLMProphetNetConfig"],
"models.xlm_roberta": ["XLMRobertaConfig"],
"models.xlm_roberta_xl": ["XLMRobertaXLConfig"],
"models.xlnet": ["XLNetConfig"],
@@ -943,7 +943,8 @@
_import_structure["models.code_llama"].append("CodeLlamaTokenizer")
_import_structure["models.cpm"].append("CpmTokenizer")
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
- _import_structure["models.ernie_m"].append("ErnieMTokenizer")
+ _import_structure["models.deprecated.ernie_m"].append("ErnieMTokenizer")
+ _import_structure["models.deprecated.xlm_prophetnet"].append("XLMProphetNetTokenizer")
_import_structure["models.fnet"].append("FNetTokenizer")
_import_structure["models.gemma"].append("GemmaTokenizer")
_import_structure["models.gpt_sw3"].append("GPTSw3Tokenizer")
@@ -967,7 +968,6 @@
_import_structure["models.t5"].append("T5Tokenizer")
_import_structure["models.udop"].append("UdopTokenizer")
_import_structure["models.xglm"].append("XGLMTokenizer")
- _import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer")
_import_structure["models.xlm_roberta"].append("XLMRobertaTokenizer")
_import_structure["models.xlnet"].append("XLNetTokenizer")
@@ -1000,6 +1000,7 @@
_import_structure["models.cpm"].append("CpmTokenizerFast")
_import_structure["models.deberta"].append("DebertaTokenizerFast")
_import_structure["models.deberta_v2"].append("DebertaV2TokenizerFast")
+ _import_structure["models.deprecated.realm"].append("RealmTokenizerFast")
_import_structure["models.deprecated.retribert"].append("RetriBertTokenizerFast")
_import_structure["models.distilbert"].append("DistilBertTokenizerFast")
_import_structure["models.dpr"].extend(
@@ -1037,7 +1038,6 @@
_import_structure["models.openai"].append("OpenAIGPTTokenizerFast")
_import_structure["models.pegasus"].append("PegasusTokenizerFast")
_import_structure["models.qwen2"].append("Qwen2TokenizerFast")
- _import_structure["models.realm"].append("RealmTokenizerFast")
_import_structure["models.reformer"].append("ReformerTokenizerFast")
_import_structure["models.rembert"].append("RemBertTokenizerFast")
_import_structure["models.roberta"].append("RobertaTokenizerFast")
@@ -1122,11 +1122,13 @@
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
)
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
- _import_structure["models.deta"].append("DetaImageProcessor")
+ _import_structure["models.deprecated.deta"].append("DetaImageProcessor")
+ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
+ _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
+ _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"])
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
- _import_structure["models.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
@@ -1158,13 +1160,11 @@
_import_structure["models.siglip"].append("SiglipImageProcessor")
_import_structure["models.superpoint"].extend(["SuperPointImageProcessor"])
_import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
- _import_structure["models.tvlt"].append("TvltImageProcessor")
_import_structure["models.tvp"].append("TvpImageProcessor")
_import_structure["models.video_llava"].append("VideoLlavaImageProcessor")
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
- _import_structure["models.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.vitmatte"].append("VitMatteImageProcessor")
_import_structure["models.vivit"].append("VivitImageProcessor")
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
@@ -1767,6 +1767,54 @@
"DeiTPreTrainedModel",
]
)
+ _import_structure["models.deprecated.deta"].extend(
+ [
+ "DetaForObjectDetection",
+ "DetaModel",
+ "DetaPreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.efficientformer"].extend(
+ [
+ "EfficientFormerForImageClassification",
+ "EfficientFormerForImageClassificationWithTeacher",
+ "EfficientFormerModel",
+ "EfficientFormerPreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.ernie_m"].extend(
+ [
+ "ErnieMForInformationExtraction",
+ "ErnieMForMultipleChoice",
+ "ErnieMForQuestionAnswering",
+ "ErnieMForSequenceClassification",
+ "ErnieMForTokenClassification",
+ "ErnieMModel",
+ "ErnieMPreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.gptsan_japanese"].extend(
+ [
+ "GPTSanJapaneseForConditionalGeneration",
+ "GPTSanJapaneseModel",
+ "GPTSanJapanesePreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.graphormer"].extend(
+ [
+ "GraphormerForGraphClassification",
+ "GraphormerModel",
+ "GraphormerPreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.jukebox"].extend(
+ [
+ "JukeboxModel",
+ "JukeboxPreTrainedModel",
+ "JukeboxPrior",
+ "JukeboxVQVAE",
+ ]
+ )
_import_structure["models.deprecated.mctct"].extend(
[
"MCTCTForCTC",
@@ -1774,7 +1822,40 @@
"MCTCTPreTrainedModel",
]
)
+ _import_structure["models.deprecated.mega"].extend(
+ [
+ "MegaForCausalLM",
+ "MegaForMaskedLM",
+ "MegaForMultipleChoice",
+ "MegaForQuestionAnswering",
+ "MegaForSequenceClassification",
+ "MegaForTokenClassification",
+ "MegaModel",
+ "MegaPreTrainedModel",
+ ]
+ )
_import_structure["models.deprecated.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
+ _import_structure["models.deprecated.nat"].extend(
+ [
+ "NatBackbone",
+ "NatForImageClassification",
+ "NatModel",
+ "NatPreTrainedModel",
+ ]
+ )
+ _import_structure["models.deprecated.nezha"].extend(
+ [
+ "NezhaForMaskedLM",
+ "NezhaForMultipleChoice",
+ "NezhaForNextSentencePrediction",
+ "NezhaForPreTraining",
+ "NezhaForQuestionAnswering",
+ "NezhaForSequenceClassification",
+ "NezhaForTokenClassification",
+ "NezhaModel",
+ "NezhaPreTrainedModel",
+ ]
+ )
_import_structure["models.deprecated.open_llama"].extend(
[
"OpenLlamaForCausalLM",
@@ -1783,12 +1864,42 @@
"OpenLlamaPreTrainedModel",
]
)
+ _import_structure["models.deprecated.qdqbert"].extend(
+ [
+ "QDQBertForMaskedLM",
+ "QDQBertForMultipleChoice",
+ "QDQBertForNextSentencePrediction",
+ "QDQBertForQuestionAnswering",
+ "QDQBertForSequenceClassification",
+ "QDQBertForTokenClassification",
+ "QDQBertLayer",
+ "QDQBertLMHeadModel",
+ "QDQBertModel",
+ "QDQBertPreTrainedModel",
+ "load_tf_weights_in_qdqbert",
+ ]
+ )
+ _import_structure["models.deprecated.realm"].extend(
+ [
+ "RealmEmbedder",
+ "RealmForOpenQA",
+ "RealmKnowledgeAugEncoder",
+ "RealmPreTrainedModel",
+ "RealmReader",
+ "RealmRetriever",
+ "RealmScorer",
+ "load_tf_weights_in_realm",
+ ]
+ )
_import_structure["models.deprecated.retribert"].extend(
[
"RetriBertModel",
"RetriBertPreTrainedModel",
]
)
+ _import_structure["models.deprecated.speech_to_text_2"].extend(
+ ["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"]
+ )
_import_structure["models.deprecated.trajectory_transformer"].extend(
[
"TrajectoryTransformerModel",
@@ -1805,6 +1916,14 @@
"load_tf_weights_in_transfo_xl",
]
)
+ _import_structure["models.deprecated.tvlt"].extend(
+ [
+ "TvltForAudioVisualClassification",
+ "TvltForPreTraining",
+ "TvltModel",
+ "TvltPreTrainedModel",
+ ]
+ )
_import_structure["models.deprecated.van"].extend(
[
"VanForImageClassification",
@@ -1812,17 +1931,27 @@
"VanPreTrainedModel",
]
)
- _import_structure["models.depth_anything"].extend(
+ _import_structure["models.deprecated.vit_hybrid"].extend(
[
- "DepthAnythingForDepthEstimation",
- "DepthAnythingPreTrainedModel",
+ "ViTHybridForImageClassification",
+ "ViTHybridModel",
+ "ViTHybridPreTrainedModel",
]
)
- _import_structure["models.deta"].extend(
+ _import_structure["models.deprecated.xlm_prophetnet"].extend(
[
- "DetaForObjectDetection",
- "DetaModel",
- "DetaPreTrainedModel",
+ "XLMProphetNetDecoder",
+ "XLMProphetNetEncoder",
+ "XLMProphetNetForCausalLM",
+ "XLMProphetNetForConditionalGeneration",
+ "XLMProphetNetModel",
+ "XLMProphetNetPreTrainedModel",
+ ]
+ )
+ _import_structure["models.depth_anything"].extend(
+ [
+ "DepthAnythingForDepthEstimation",
+ "DepthAnythingPreTrainedModel",
]
)
_import_structure["models.detr"].extend(
@@ -1885,14 +2014,6 @@
"DPTPreTrainedModel",
]
)
- _import_structure["models.efficientformer"].extend(
- [
- "EfficientFormerForImageClassification",
- "EfficientFormerForImageClassificationWithTeacher",
- "EfficientFormerModel",
- "EfficientFormerPreTrainedModel",
- ]
- )
_import_structure["models.efficientnet"].extend(
[
"EfficientNetForImageClassification",
@@ -1935,17 +2056,6 @@
"ErniePreTrainedModel",
]
)
- _import_structure["models.ernie_m"].extend(
- [
- "ErnieMForInformationExtraction",
- "ErnieMForMultipleChoice",
- "ErnieMForQuestionAnswering",
- "ErnieMForSequenceClassification",
- "ErnieMForTokenClassification",
- "ErnieMModel",
- "ErnieMPreTrainedModel",
- ]
- )
_import_structure["models.esm"].extend(
[
"EsmFoldPreTrainedModel",
@@ -2121,20 +2231,6 @@
"GPTJPreTrainedModel",
]
)
- _import_structure["models.gptsan_japanese"].extend(
- [
- "GPTSanJapaneseForConditionalGeneration",
- "GPTSanJapaneseModel",
- "GPTSanJapanesePreTrainedModel",
- ]
- )
- _import_structure["models.graphormer"].extend(
- [
- "GraphormerForGraphClassification",
- "GraphormerModel",
- "GraphormerPreTrainedModel",
- ]
- )
_import_structure["models.grounding_dino"].extend(
[
"GroundingDinoForObjectDetection",
@@ -2225,14 +2321,6 @@
"JetMoePreTrainedModel",
]
)
- _import_structure["models.jukebox"].extend(
- [
- "JukeboxModel",
- "JukeboxPreTrainedModel",
- "JukeboxPrior",
- "JukeboxVQVAE",
- ]
- )
_import_structure["models.kosmos2"].extend(
[
"Kosmos2ForConditionalGeneration",
@@ -2410,18 +2498,6 @@
"MBartPreTrainedModel",
]
)
- _import_structure["models.mega"].extend(
- [
- "MegaForCausalLM",
- "MegaForMaskedLM",
- "MegaForMultipleChoice",
- "MegaForQuestionAnswering",
- "MegaForSequenceClassification",
- "MegaForTokenClassification",
- "MegaModel",
- "MegaPreTrainedModel",
- ]
- )
_import_structure["models.megatron_bert"].extend(
[
"MegatronBertForCausalLM",
@@ -2580,27 +2656,6 @@
"MvpPreTrainedModel",
]
)
- _import_structure["models.nat"].extend(
- [
- "NatBackbone",
- "NatForImageClassification",
- "NatModel",
- "NatPreTrainedModel",
- ]
- )
- _import_structure["models.nezha"].extend(
- [
- "NezhaForMaskedLM",
- "NezhaForMultipleChoice",
- "NezhaForNextSentencePrediction",
- "NezhaForPreTraining",
- "NezhaForQuestionAnswering",
- "NezhaForSequenceClassification",
- "NezhaForTokenClassification",
- "NezhaModel",
- "NezhaPreTrainedModel",
- ]
- )
_import_structure["models.nllb_moe"].extend(
[
"NllbMoeForConditionalGeneration",
@@ -2811,21 +2866,6 @@
"PvtV2PreTrainedModel",
]
)
- _import_structure["models.qdqbert"].extend(
- [
- "QDQBertForMaskedLM",
- "QDQBertForMultipleChoice",
- "QDQBertForNextSentencePrediction",
- "QDQBertForQuestionAnswering",
- "QDQBertForSequenceClassification",
- "QDQBertForTokenClassification",
- "QDQBertLayer",
- "QDQBertLMHeadModel",
- "QDQBertModel",
- "QDQBertPreTrainedModel",
- "load_tf_weights_in_qdqbert",
- ]
- )
_import_structure["models.qwen2"].extend(
[
"Qwen2ForCausalLM",
@@ -2852,18 +2892,6 @@
"RagTokenForGeneration",
]
)
- _import_structure["models.realm"].extend(
- [
- "RealmEmbedder",
- "RealmForOpenQA",
- "RealmKnowledgeAugEncoder",
- "RealmPreTrainedModel",
- "RealmReader",
- "RealmRetriever",
- "RealmScorer",
- "load_tf_weights_in_realm",
- ]
- )
_import_structure["models.recurrent_gemma"].extend(
[
"RecurrentGemmaForCausalLM",
@@ -3052,7 +3080,6 @@
"Speech2TextPreTrainedModel",
]
)
- _import_structure["models.speech_to_text_2"].extend(["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"])
_import_structure["models.speecht5"].extend(
[
"SpeechT5ForSpeechToSpeech",
@@ -3200,14 +3227,6 @@
"TrOCRPreTrainedModel",
]
)
- _import_structure["models.tvlt"].extend(
- [
- "TvltForAudioVisualClassification",
- "TvltForPreTraining",
- "TvltModel",
- "TvltPreTrainedModel",
- ]
- )
_import_structure["models.tvp"].extend(
[
"TvpForVideoGrounding",
@@ -3320,13 +3339,6 @@
"ViTPreTrainedModel",
]
)
- _import_structure["models.vit_hybrid"].extend(
- [
- "ViTHybridForImageClassification",
- "ViTHybridModel",
- "ViTHybridPreTrainedModel",
- ]
- )
_import_structure["models.vit_mae"].extend(
[
"ViTMAEForPreTraining",
@@ -3447,16 +3459,6 @@
"XLMWithLMHeadModel",
]
)
- _import_structure["models.xlm_prophetnet"].extend(
- [
- "XLMProphetNetDecoder",
- "XLMProphetNetEncoder",
- "XLMProphetNetForCausalLM",
- "XLMProphetNetForConditionalGeneration",
- "XLMProphetNetModel",
- "XLMProphetNetPreTrainedModel",
- ]
- )
_import_structure["models.xlm_roberta"].extend(
[
"XLMRobertaForCausalLM",
@@ -3799,6 +3801,14 @@
"TFDeiTPreTrainedModel",
]
)
+ _import_structure["models.deprecated.efficientformer"].extend(
+ [
+ "TFEfficientFormerForImageClassification",
+ "TFEfficientFormerForImageClassificationWithTeacher",
+ "TFEfficientFormerModel",
+ "TFEfficientFormerPreTrainedModel",
+ ]
+ )
_import_structure["models.deprecated.transfo_xl"].extend(
[
"TFAdaptiveEmbedding",
@@ -3831,14 +3841,6 @@
"TFDPRReader",
]
)
- _import_structure["models.efficientformer"].extend(
- [
- "TFEfficientFormerForImageClassification",
- "TFEfficientFormerForImageClassificationWithTeacher",
- "TFEfficientFormerModel",
- "TFEfficientFormerPreTrainedModel",
- ]
- )
_import_structure["models.electra"].extend(
[
"TFElectraForMaskedLM",
@@ -4888,19 +4890,48 @@
DeformableDetrConfig,
)
from .models.deit import DeiTConfig
+ from .models.deprecated.deta import DetaConfig
+ from .models.deprecated.efficientformer import (
+ EfficientFormerConfig,
+ )
+ from .models.deprecated.ernie_m import ErnieMConfig
+ from .models.deprecated.gptsan_japanese import (
+ GPTSanJapaneseConfig,
+ GPTSanJapaneseTokenizer,
+ )
+ from .models.deprecated.graphormer import GraphormerConfig
+ from .models.deprecated.jukebox import (
+ JukeboxConfig,
+ JukeboxPriorConfig,
+ JukeboxTokenizer,
+ JukeboxVQVAEConfig,
+ )
from .models.deprecated.mctct import (
MCTCTConfig,
MCTCTFeatureExtractor,
MCTCTProcessor,
)
+ from .models.deprecated.mega import MegaConfig
from .models.deprecated.mmbt import MMBTConfig
+ from .models.deprecated.nat import NatConfig
+ from .models.deprecated.nezha import NezhaConfig
from .models.deprecated.open_llama import (
OpenLlamaConfig,
)
+ from .models.deprecated.qdqbert import QDQBertConfig
+ from .models.deprecated.realm import (
+ RealmConfig,
+ RealmTokenizer,
+ )
from .models.deprecated.retribert import (
RetriBertConfig,
RetriBertTokenizer,
)
+ from .models.deprecated.speech_to_text_2 import (
+ Speech2Text2Config,
+ Speech2Text2Processor,
+ Speech2Text2Tokenizer,
+ )
from .models.deprecated.tapex import TapexTokenizer
from .models.deprecated.trajectory_transformer import (
TrajectoryTransformerConfig,
@@ -4910,9 +4941,19 @@
TransfoXLCorpus,
TransfoXLTokenizer,
)
+ from .models.deprecated.tvlt import (
+ TvltConfig,
+ TvltFeatureExtractor,
+ TvltProcessor,
+ )
from .models.deprecated.van import VanConfig
+ from .models.deprecated.vit_hybrid import (
+ ViTHybridConfig,
+ )
+ from .models.deprecated.xlm_prophetnet import (
+ XLMProphetNetConfig,
+ )
from .models.depth_anything import DepthAnythingConfig
- from .models.deta import DetaConfig
from .models.detr import DetrConfig
from .models.dinat import DinatConfig
from .models.dinov2 import Dinov2Config
@@ -4932,9 +4973,6 @@
DPRReaderTokenizer,
)
from .models.dpt import DPTConfig
- from .models.efficientformer import (
- EfficientFormerConfig,
- )
from .models.efficientnet import (
EfficientNetConfig,
)
@@ -4948,7 +4986,6 @@
)
from .models.encoder_decoder import EncoderDecoderConfig
from .models.ernie import ErnieConfig
- from .models.ernie_m import ErnieMConfig
from .models.esm import EsmConfig, EsmTokenizer
from .models.falcon import FalconConfig
from .models.fastspeech2_conformer import (
@@ -4996,11 +5033,6 @@
GPTNeoXJapaneseConfig,
)
from .models.gptj import GPTJConfig
- from .models.gptsan_japanese import (
- GPTSanJapaneseConfig,
- GPTSanJapaneseTokenizer,
- )
- from .models.graphormer import GraphormerConfig
from .models.grounding_dino import (
GroundingDinoConfig,
GroundingDinoProcessor,
@@ -5027,12 +5059,6 @@
)
from .models.jamba import JambaConfig
from .models.jetmoe import JetMoeConfig
- from .models.jukebox import (
- JukeboxConfig,
- JukeboxPriorConfig,
- JukeboxTokenizer,
- JukeboxVQVAEConfig,
- )
from .models.kosmos2 import (
Kosmos2Config,
Kosmos2Processor,
@@ -5098,7 +5124,6 @@
MaskFormerSwinConfig,
)
from .models.mbart import MBartConfig
- from .models.mega import MegaConfig
from .models.megatron_bert import (
MegatronBertConfig,
)
@@ -5141,8 +5166,6 @@
MusicgenMelodyDecoderConfig,
)
from .models.mvp import MvpConfig, MvpTokenizer
- from .models.nat import NatConfig
- from .models.nezha import NezhaConfig
from .models.nllb_moe import NllbMoeConfig
from .models.nougat import NougatProcessor
from .models.nystromformer import (
@@ -5213,14 +5236,9 @@
)
from .models.pvt import PvtConfig
from .models.pvt_v2 import PvtV2Config
- from .models.qdqbert import QDQBertConfig
from .models.qwen2 import Qwen2Config, Qwen2Tokenizer
from .models.qwen2_moe import Qwen2MoeConfig
from .models.rag import RagConfig, RagRetriever, RagTokenizer
- from .models.realm import (
- RealmConfig,
- RealmTokenizer,
- )
from .models.recurrent_gemma import RecurrentGemmaConfig
from .models.reformer import ReformerConfig
from .models.regnet import RegNetConfig
@@ -5273,11 +5291,6 @@
Speech2TextFeatureExtractor,
Speech2TextProcessor,
)
- from .models.speech_to_text_2 import (
- Speech2Text2Config,
- Speech2Text2Processor,
- Speech2Text2Tokenizer,
- )
from .models.speecht5 import (
SpeechT5Config,
SpeechT5FeatureExtractor,
@@ -5323,11 +5336,6 @@
TrOCRConfig,
TrOCRProcessor,
)
- from .models.tvlt import (
- TvltConfig,
- TvltFeatureExtractor,
- TvltProcessor,
- )
from .models.tvp import (
TvpConfig,
TvpProcessor,
@@ -5365,9 +5373,6 @@
VisualBertConfig,
)
from .models.vit import ViTConfig
- from .models.vit_hybrid import (
- ViTHybridConfig,
- )
from .models.vit_mae import ViTMAEConfig
from .models.vit_msn import ViTMSNConfig
from .models.vitdet import VitDetConfig
@@ -5408,9 +5413,6 @@
)
from .models.xglm import XGLMConfig
from .models.xlm import XLMConfig, XLMTokenizer
- from .models.xlm_prophetnet import (
- XLMProphetNetConfig,
- )
from .models.xlm_roberta import (
XLMRobertaConfig,
)
@@ -5570,7 +5572,8 @@
from .models.code_llama import CodeLlamaTokenizer
from .models.cpm import CpmTokenizer
from .models.deberta_v2 import DebertaV2Tokenizer
- from .models.ernie_m import ErnieMTokenizer
+ from .models.deprecated.ernie_m import ErnieMTokenizer
+ from .models.deprecated.xlm_prophetnet import XLMProphetNetTokenizer
from .models.fnet import FNetTokenizer
from .models.gemma import GemmaTokenizer
from .models.gpt_sw3 import GPTSw3Tokenizer
@@ -5593,7 +5596,6 @@
from .models.t5 import T5Tokenizer
from .models.udop import UdopTokenizer
from .models.xglm import XGLMTokenizer
- from .models.xlm_prophetnet import XLMProphetNetTokenizer
from .models.xlm_roberta import XLMRobertaTokenizer
from .models.xlnet import XLNetTokenizer
@@ -5621,6 +5623,7 @@
from .models.cpm import CpmTokenizerFast
from .models.deberta import DebertaTokenizerFast
from .models.deberta_v2 import DebertaV2TokenizerFast
+ from .models.deprecated.realm import RealmTokenizerFast
from .models.deprecated.retribert import RetriBertTokenizerFast
from .models.distilbert import DistilBertTokenizerFast
from .models.dpr import (
@@ -5656,7 +5659,6 @@
from .models.openai import OpenAIGPTTokenizerFast
from .models.pegasus import PegasusTokenizerFast
from .models.qwen2 import Qwen2TokenizerFast
- from .models.realm import RealmTokenizerFast
from .models.reformer import ReformerTokenizerFast
from .models.rembert import RemBertTokenizerFast
from .models.roberta import RobertaTokenizerFast
@@ -5726,11 +5728,13 @@
DeformableDetrImageProcessor,
)
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
- from .models.deta import DetaImageProcessor
+ from .models.deprecated.deta import DetaImageProcessor
+ from .models.deprecated.efficientformer import EfficientFormerImageProcessor
+ from .models.deprecated.tvlt import TvltImageProcessor
+ from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
from .models.detr import DetrFeatureExtractor, DetrImageProcessor
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
- from .models.efficientformer import EfficientFormerImageProcessor
from .models.efficientnet import EfficientNetImageProcessor
from .models.flava import (
FlavaFeatureExtractor,
@@ -5784,13 +5788,11 @@
from .models.siglip import SiglipImageProcessor
from .models.superpoint import SuperPointImageProcessor
from .models.swin2sr import Swin2SRImageProcessor
- from .models.tvlt import TvltImageProcessor
from .models.tvp import TvpImageProcessor
from .models.video_llava import VideoLlavaImageProcessor
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
- from .models.vit_hybrid import ViTHybridImageProcessor
from .models.vitmatte import VitMatteImageProcessor
from .models.vivit import VivitImageProcessor
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
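Throughout src/transformers/__init__.py the entries for the deprecated models are moved, not deleted, so the public top-level names stay importable; only the internal module path changes. A short sketch of the intended effect (import paths taken from this patch; the commented line is the pre-move location):

    from transformers import DetaConfig, ViTHybridConfig          # still resolves via the lazy top-level exports
    from transformers.models.deprecated.deta import DetaConfig    # new internal location
    # from transformers.models.deta import DetaConfig             # old internal path, gone after this patch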
@@ -6300,26 +6302,116 @@
DeiTModel,
DeiTPreTrainedModel,
)
+ from .models.deprecated.deta import (
+ DetaForObjectDetection,
+ DetaModel,
+ DetaPreTrainedModel,
+ )
+ from .models.deprecated.efficientformer import (
+ EfficientFormerForImageClassification,
+ EfficientFormerForImageClassificationWithTeacher,
+ EfficientFormerModel,
+ EfficientFormerPreTrainedModel,
+ )
+ from .models.deprecated.ernie_m import (
+ ErnieMForInformationExtraction,
+ ErnieMForMultipleChoice,
+ ErnieMForQuestionAnswering,
+ ErnieMForSequenceClassification,
+ ErnieMForTokenClassification,
+ ErnieMModel,
+ ErnieMPreTrainedModel,
+ )
+ from .models.deprecated.gptsan_japanese import (
+ GPTSanJapaneseForConditionalGeneration,
+ GPTSanJapaneseModel,
+ GPTSanJapanesePreTrainedModel,
+ )
+ from .models.deprecated.graphormer import (
+ GraphormerForGraphClassification,
+ GraphormerModel,
+ GraphormerPreTrainedModel,
+ )
+ from .models.deprecated.jukebox import (
+ JukeboxModel,
+ JukeboxPreTrainedModel,
+ JukeboxPrior,
+ JukeboxVQVAE,
+ )
from .models.deprecated.mctct import (
MCTCTForCTC,
MCTCTModel,
MCTCTPreTrainedModel,
)
+ from .models.deprecated.mega import (
+ MegaForCausalLM,
+ MegaForMaskedLM,
+ MegaForMultipleChoice,
+ MegaForQuestionAnswering,
+ MegaForSequenceClassification,
+ MegaForTokenClassification,
+ MegaModel,
+ MegaPreTrainedModel,
+ )
from .models.deprecated.mmbt import (
MMBTForClassification,
MMBTModel,
ModalEmbeddings,
)
+ from .models.deprecated.nat import (
+ NatBackbone,
+ NatForImageClassification,
+ NatModel,
+ NatPreTrainedModel,
+ )
+ from .models.deprecated.nezha import (
+ NezhaForMaskedLM,
+ NezhaForMultipleChoice,
+ NezhaForNextSentencePrediction,
+ NezhaForPreTraining,
+ NezhaForQuestionAnswering,
+ NezhaForSequenceClassification,
+ NezhaForTokenClassification,
+ NezhaModel,
+ NezhaPreTrainedModel,
+ )
from .models.deprecated.open_llama import (
OpenLlamaForCausalLM,
OpenLlamaForSequenceClassification,
OpenLlamaModel,
OpenLlamaPreTrainedModel,
)
+ from .models.deprecated.qdqbert import (
+ QDQBertForMaskedLM,
+ QDQBertForMultipleChoice,
+ QDQBertForNextSentencePrediction,
+ QDQBertForQuestionAnswering,
+ QDQBertForSequenceClassification,
+ QDQBertForTokenClassification,
+ QDQBertLayer,
+ QDQBertLMHeadModel,
+ QDQBertModel,
+ QDQBertPreTrainedModel,
+ load_tf_weights_in_qdqbert,
+ )
+ from .models.deprecated.realm import (
+ RealmEmbedder,
+ RealmForOpenQA,
+ RealmKnowledgeAugEncoder,
+ RealmPreTrainedModel,
+ RealmReader,
+ RealmRetriever,
+ RealmScorer,
+ load_tf_weights_in_realm,
+ )
from .models.deprecated.retribert import (
RetriBertModel,
RetriBertPreTrainedModel,
)
+ from .models.deprecated.speech_to_text_2 import (
+ Speech2Text2ForCausalLM,
+ Speech2Text2PreTrainedModel,
+ )
from .models.deprecated.trajectory_transformer import (
TrajectoryTransformerModel,
TrajectoryTransformerPreTrainedModel,
@@ -6332,20 +6424,34 @@
TransfoXLPreTrainedModel,
load_tf_weights_in_transfo_xl,
)
+ from .models.deprecated.tvlt import (
+ TvltForAudioVisualClassification,
+ TvltForPreTraining,
+ TvltModel,
+ TvltPreTrainedModel,
+ )
from .models.deprecated.van import (
VanForImageClassification,
VanModel,
VanPreTrainedModel,
)
+ from .models.deprecated.vit_hybrid import (
+ ViTHybridForImageClassification,
+ ViTHybridModel,
+ ViTHybridPreTrainedModel,
+ )
+ from .models.deprecated.xlm_prophetnet import (
+ XLMProphetNetDecoder,
+ XLMProphetNetEncoder,
+ XLMProphetNetForCausalLM,
+ XLMProphetNetForConditionalGeneration,
+ XLMProphetNetModel,
+ XLMProphetNetPreTrainedModel,
+ )
from .models.depth_anything import (
DepthAnythingForDepthEstimation,
DepthAnythingPreTrainedModel,
)
- from .models.deta import (
- DetaForObjectDetection,
- DetaModel,
- DetaPreTrainedModel,
- )
from .models.detr import (
DetrForObjectDetection,
DetrForSegmentation,
@@ -6392,12 +6498,6 @@
DPTModel,
DPTPreTrainedModel,
)
- from .models.efficientformer import (
- EfficientFormerForImageClassification,
- EfficientFormerForImageClassificationWithTeacher,
- EfficientFormerModel,
- EfficientFormerPreTrainedModel,
- )
from .models.efficientnet import (
EfficientNetForImageClassification,
EfficientNetModel,
@@ -6432,15 +6532,6 @@
ErnieModel,
ErniePreTrainedModel,
)
- from .models.ernie_m import (
- ErnieMForInformationExtraction,
- ErnieMForMultipleChoice,
- ErnieMForQuestionAnswering,
- ErnieMForSequenceClassification,
- ErnieMForTokenClassification,
- ErnieMModel,
- ErnieMPreTrainedModel,
- )
from .models.esm import (
EsmFoldPreTrainedModel,
EsmForMaskedLM,
@@ -6589,16 +6680,6 @@
GPTJModel,
GPTJPreTrainedModel,
)
- from .models.gptsan_japanese import (
- GPTSanJapaneseForConditionalGeneration,
- GPTSanJapaneseModel,
- GPTSanJapanesePreTrainedModel,
- )
- from .models.graphormer import (
- GraphormerForGraphClassification,
- GraphormerModel,
- GraphormerPreTrainedModel,
- )
from .models.grounding_dino import (
GroundingDinoForObjectDetection,
GroundingDinoModel,
@@ -6667,12 +6748,6 @@
JetMoeModel,
JetMoePreTrainedModel,
)
- from .models.jukebox import (
- JukeboxModel,
- JukeboxPreTrainedModel,
- JukeboxPrior,
- JukeboxVQVAE,
- )
from .models.kosmos2 import (
Kosmos2ForConditionalGeneration,
Kosmos2Model,
@@ -6810,16 +6885,6 @@
MBartModel,
MBartPreTrainedModel,
)
- from .models.mega import (
- MegaForCausalLM,
- MegaForMaskedLM,
- MegaForMultipleChoice,
- MegaForQuestionAnswering,
- MegaForSequenceClassification,
- MegaForTokenClassification,
- MegaModel,
- MegaPreTrainedModel,
- )
from .models.megatron_bert import (
MegatronBertForCausalLM,
MegatronBertForMaskedLM,
@@ -6946,23 +7011,6 @@
MvpModel,
MvpPreTrainedModel,
)
- from .models.nat import (
- NatBackbone,
- NatForImageClassification,
- NatModel,
- NatPreTrainedModel,
- )
- from .models.nezha import (
- NezhaForMaskedLM,
- NezhaForMultipleChoice,
- NezhaForNextSentencePrediction,
- NezhaForPreTraining,
- NezhaForQuestionAnswering,
- NezhaForSequenceClassification,
- NezhaForTokenClassification,
- NezhaModel,
- NezhaPreTrainedModel,
- )
from .models.nllb_moe import (
NllbMoeForConditionalGeneration,
NllbMoeModel,
@@ -7125,19 +7173,6 @@
PvtV2Model,
PvtV2PreTrainedModel,
)
- from .models.qdqbert import (
- QDQBertForMaskedLM,
- QDQBertForMultipleChoice,
- QDQBertForNextSentencePrediction,
- QDQBertForQuestionAnswering,
- QDQBertForSequenceClassification,
- QDQBertForTokenClassification,
- QDQBertLayer,
- QDQBertLMHeadModel,
- QDQBertModel,
- QDQBertPreTrainedModel,
- load_tf_weights_in_qdqbert,
- )
from .models.qwen2 import (
Qwen2ForCausalLM,
Qwen2ForSequenceClassification,
@@ -7158,16 +7193,6 @@
RagSequenceForGeneration,
RagTokenForGeneration,
)
- from .models.realm import (
- RealmEmbedder,
- RealmForOpenQA,
- RealmKnowledgeAugEncoder,
- RealmPreTrainedModel,
- RealmReader,
- RealmRetriever,
- RealmScorer,
- load_tf_weights_in_realm,
- )
from .models.recurrent_gemma import (
RecurrentGemmaForCausalLM,
RecurrentGemmaModel,
@@ -7318,10 +7343,6 @@
Speech2TextModel,
Speech2TextPreTrainedModel,
)
- from .models.speech_to_text_2 import (
- Speech2Text2ForCausalLM,
- Speech2Text2PreTrainedModel,
- )
from .models.speecht5 import (
SpeechT5ForSpeechToSpeech,
SpeechT5ForSpeechToText,
@@ -7435,12 +7456,6 @@
TrOCRForCausalLM,
TrOCRPreTrainedModel,
)
- from .models.tvlt import (
- TvltForAudioVisualClassification,
- TvltForPreTraining,
- TvltModel,
- TvltPreTrainedModel,
- )
from .models.tvp import (
TvpForVideoGrounding,
TvpModel,
@@ -7525,11 +7540,6 @@
ViTModel,
ViTPreTrainedModel,
)
- from .models.vit_hybrid import (
- ViTHybridForImageClassification,
- ViTHybridModel,
- ViTHybridPreTrainedModel,
- )
from .models.vit_mae import (
ViTMAEForPreTraining,
ViTMAELayer,
@@ -7622,14 +7632,6 @@
XLMPreTrainedModel,
XLMWithLMHeadModel,
)
- from .models.xlm_prophetnet import (
- XLMProphetNetDecoder,
- XLMProphetNetEncoder,
- XLMProphetNetForCausalLM,
- XLMProphetNetForConditionalGeneration,
- XLMProphetNetModel,
- XLMProphetNetPreTrainedModel,
- )
from .models.xlm_roberta import (
XLMRobertaForCausalLM,
XLMRobertaForMaskedLM,
@@ -7921,6 +7923,12 @@
TFDeiTModel,
TFDeiTPreTrainedModel,
)
+ from .models.deprecated.efficientformer import (
+ TFEfficientFormerForImageClassification,
+ TFEfficientFormerForImageClassificationWithTeacher,
+ TFEfficientFormerModel,
+ TFEfficientFormerPreTrainedModel,
+ )
from .models.deprecated.transfo_xl import (
TFAdaptiveEmbedding,
TFTransfoXLForSequenceClassification,
@@ -7947,12 +7955,6 @@
TFDPRQuestionEncoder,
TFDPRReader,
)
- from .models.efficientformer import (
- TFEfficientFormerForImageClassification,
- TFEfficientFormerForImageClassificationWithTeacher,
- TFEfficientFormerModel,
- TFEfficientFormerPreTrainedModel,
- )
from .models.electra import (
TFElectraForMaskedLM,
TFElectraForMultipleChoice,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 72e2d892ec8107..24b602f18c8f38 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -67,7 +67,6 @@
deit,
deprecated,
depth_anything,
- deta,
detr,
dialogpt,
dinat,
@@ -77,13 +76,11 @@
donut,
dpr,
dpt,
- efficientformer,
efficientnet,
electra,
encodec,
encoder_decoder,
ernie,
- ernie_m,
esm,
falcon,
fastspeech2_conformer,
@@ -104,8 +101,6 @@
gpt_neox_japanese,
gpt_sw3,
gptj,
- gptsan_japanese,
- graphormer,
grounding_dino,
groupvit,
herbert,
@@ -118,7 +113,6 @@
instructblip,
jamba,
jetmoe,
- jukebox,
kosmos2,
layoutlm,
layoutlmv2,
@@ -142,7 +136,6 @@
maskformer,
mbart,
mbart50,
- mega,
megatron_bert,
megatron_gpt2,
mgp_str,
@@ -161,8 +154,6 @@
musicgen,
musicgen_melody,
mvp,
- nat,
- nezha,
nllb,
nllb_moe,
nougat,
@@ -190,11 +181,9 @@
prophetnet,
pvt,
pvt_v2,
- qdqbert,
qwen2,
qwen2_moe,
rag,
- realm,
recurrent_gemma,
reformer,
regnet,
@@ -215,7 +204,6 @@
siglip,
speech_encoder_decoder,
speech_to_text,
- speech_to_text_2,
speecht5,
splinter,
squeezebert,
@@ -234,7 +222,6 @@
timesformer,
timm_backbone,
trocr,
- tvlt,
tvp,
udop,
umt5,
@@ -250,7 +237,6 @@
vision_text_dual_encoder,
visual_bert,
vit,
- vit_hybrid,
vit_mae,
vit_msn,
vitdet,
@@ -267,7 +253,6 @@
x_clip,
xglm,
xlm,
- xlm_prophetnet,
xlm_roberta,
xlm_roberta_xl,
xlnet,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 464f80e2d95128..40e282166ef99e 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -585,14 +585,29 @@
# `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
DEPRECATED_MODELS = [
"bort",
+ "deta",
+ "efficientformer",
+ "ernie_m",
+ "gptsan_japanese",
+ "graphormer",
+ "jukebox",
"mctct",
+ "mega",
"mmbt",
+ "nat",
+ "nezha",
"open_llama",
+ "qdqbert",
+ "realm",
"retribert",
+ "speech_to_text_2",
"tapex",
"trajectory_transformer",
"transfo_xl",
+ "tvlt",
"van",
+ "vit_hybrid",
+ "xlm_prophetnet",
]
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
@@ -616,7 +631,11 @@ def model_type_to_module_name(key):
"""Converts a config key to the corresponding module."""
# Special treatment
if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
- return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
+ key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
+
+ if key in DEPRECATED_MODELS:
+ key = f"deprecated.{key}"
+ return key
key = key.replace("-", "_")
if key in DEPRECATED_MODELS:
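The hunk above is the behavioural core of the move: once a model type is listed in DEPRECATED_MODELS, its module lookup is redirected into the `deprecated` subpackage, so AutoConfig/AutoModel resolution keeps working with the new layout. A condensed, standalone sketch of the resulting mapping (the two constants below are abbreviated, illustrative stand-ins for the real module-level ones, not their full contents):

    DEPRECATED_MODELS = ["deta", "ernie_m", "vit_hybrid"]        # abbreviated
    SPECIAL_MODEL_TYPE_TO_MODULE_NAME = {"kosmos-2": "kosmos2"}  # illustrative entry

    def model_type_to_module_name(key):
        # Special-cased keys are normalized first, then checked for deprecation.
        if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
            key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
            if key in DEPRECATED_MODELS:
                key = f"deprecated.{key}"
            return key
        # Every other key: dashes become underscores, then the same deprecation check.
        key = key.replace("-", "_")
        if key in DEPRECATED_MODELS:
            key = f"deprecated.{key}"
        return key

    assert model_type_to_module_name("deta") == "deprecated.deta"
    assert model_type_to_module_name("bert") == "bert"

so a config with model_type "deta" now loads from transformers.models.deprecated.deta without any change on the caller's side.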
diff --git a/src/transformers/models/deta/__init__.py b/src/transformers/models/deprecated/deta/__init__.py
similarity index 94%
rename from src/transformers/models/deta/__init__.py
rename to src/transformers/models/deprecated/deta/__init__.py
index 843a4dc4d803d9..ab54ec6f4391e3 100644
--- a/src/transformers/models/deta/__init__.py
+++ b/src/transformers/models/deprecated/deta/__init__.py
@@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
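The one-dot change just above (`...utils` -> `....utils`) recurs through the rest of the patch: every rename into models/deprecated/ pushes the module one package level deeper, so each relative import needs one more leading dot to reach the same target, transformers.utils. A standalone sketch of Python's resolution rule (the helper name is ours, not part of transformers):

    def resolve_relative(package: str, dots: int, name: str) -> str:
        # Each leading dot beyond the first climbs one package level, mirroring how
        # Python resolves "from <dots><name> import ..." issued from inside `package`.
        base = package.rsplit(".", dots - 1)[0]
        return f"{base}.{name}"

    # Old location: transformers/models/deta -> three dots reach transformers.utils
    assert resolve_relative("transformers.models.deta", 3, "utils") == "transformers.utils"
    # New location: transformers/models/deprecated/deta -> four dots are needed
    assert resolve_relative("transformers.models.deprecated.deta", 4, "utils") == "transformers.utils"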
diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deprecated/deta/configuration_deta.py
similarity index 99%
rename from src/transformers/models/deta/configuration_deta.py
rename to src/transformers/models/deprecated/deta/configuration_deta.py
index d7fe7eadc73378..fcee8fc62abf50 100644
--- a/src/transformers/models/deta/configuration_deta.py
+++ b/src/transformers/models/deprecated/deta/configuration_deta.py
@@ -14,9 +14,9 @@
# limitations under the License.
"""DETA model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py
similarity index 100%
rename from src/transformers/models/deta/convert_deta_resnet_to_pytorch.py
rename to src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py
diff --git a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py
similarity index 100%
rename from src/transformers/models/deta/convert_deta_swin_to_pytorch.py
rename to src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py
diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deprecated/deta/image_processing_deta.py
similarity index 99%
rename from src/transformers/models/deta/image_processing_deta.py
rename to src/transformers/models/deprecated/deta/image_processing_deta.py
index a73eedba2c57eb..57a9584397df76 100644
--- a/src/transformers/models/deta/image_processing_deta.py
+++ b/src/transformers/models/deprecated/deta/image_processing_deta.py
@@ -19,9 +19,9 @@
import numpy as np
-from ...feature_extraction_utils import BatchFeature
-from ...image_processing_utils import BaseImageProcessor, get_size_dict
-from ...image_transforms import (
+from ....feature_extraction_utils import BatchFeature
+from ....image_processing_utils import BaseImageProcessor, get_size_dict
+from ....image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
@@ -31,7 +31,7 @@
rgb_to_id,
to_channel_dimension_format,
)
-from ...image_utils import (
+from ....image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
@@ -48,7 +48,7 @@
validate_annotations,
validate_preprocess_arguments,
)
-from ...utils import (
+from ....utils import (
is_flax_available,
is_jax_tensor,
is_tf_available,
@@ -59,7 +59,7 @@
is_vision_available,
logging,
)
-from ...utils.generic import TensorType
+from ....utils.generic import TensorType
if is_torch_available():
diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py
similarity index 99%
rename from src/transformers/models/deta/modeling_deta.py
rename to src/transformers/models/deprecated/deta/modeling_deta.py
index fcd8fd82b680bd..03341f0ab8dcbf 100644
--- a/src/transformers/models/deta/modeling_deta.py
+++ b/src/transformers/models/deprecated/deta/modeling_deta.py
@@ -28,8 +28,8 @@
from torch.autograd import Function
from torch.autograd.function import once_differentiable
-from ...activations import ACT2FN
-from ...file_utils import (
+from ....activations import ACT2FN
+from ....file_utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
@@ -38,12 +38,12 @@
is_vision_available,
replace_return_docstrings,
)
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
-from ...modeling_outputs import BaseModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import meshgrid
-from ...utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
-from ...utils.backbone_utils import load_backbone
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import meshgrid
+from ....utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
+from ....utils.backbone_utils import load_backbone
from .configuration_deta import DetaConfig
diff --git a/src/transformers/models/efficientformer/__init__.py b/src/transformers/models/deprecated/efficientformer/__init__.py
similarity index 99%
rename from src/transformers/models/efficientformer/__init__.py
rename to src/transformers/models/deprecated/efficientformer/__init__.py
index 9b36518587cf44..67d046a8b6fc56 100644
--- a/src/transformers/models/efficientformer/__init__.py
+++ b/src/transformers/models/deprecated/efficientformer/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import (
+from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
similarity index 98%
rename from src/transformers/models/efficientformer/configuration_efficientformer.py
rename to src/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
index a9fbfa7e0575f1..fb161d61fcbcdb 100644
--- a/src/transformers/models/efficientformer/configuration_efficientformer.py
+++ b/src/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
@@ -16,8 +16,8 @@
from typing import List
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
similarity index 100%
rename from src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
diff --git a/src/transformers/models/efficientformer/image_processing_efficientformer.py b/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
similarity index 98%
rename from src/transformers/models/efficientformer/image_processing_efficientformer.py
rename to src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
index 38756f7c958f5d..15fdf04051c1fb 100644
--- a/src/transformers/models/efficientformer/image_processing_efficientformer.py
+++ b/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
@@ -18,13 +18,13 @@
import numpy as np
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
-from ...image_transforms import (
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
-from ...image_utils import (
+from ....image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
@@ -38,7 +38,7 @@
validate_kwargs,
validate_preprocess_arguments,
)
-from ...utils import TensorType, logging
+from ....utils import TensorType, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
similarity index 99%
rename from src/transformers/models/efficientformer/modeling_efficientformer.py
rename to src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
index 44d2adbed4be88..461490c7f5790e 100644
--- a/src/transformers/models/efficientformer/modeling_efficientformer.py
+++ b/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
@@ -23,10 +23,10 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
similarity index 99%
rename from src/transformers/models/efficientformer/modeling_tf_efficientformer.py
rename to src/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
index a8ce9dd3068e7c..d47d06e7837c44 100644
--- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py
+++ b/src/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
@@ -20,13 +20,13 @@
import tensorflow as tf
-from ...activations_tf import ACT2FN
-from ...modeling_tf_outputs import (
+from ....activations_tf import ACT2FN
+from ....modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFImageClassifierOutput,
)
-from ...modeling_tf_utils import (
+from ....modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
@@ -34,8 +34,8 @@
keras_serializable,
unpack_inputs,
)
-from ...tf_utils import shape_list, stable_softmax
-from ...utils import (
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
diff --git a/src/transformers/models/ernie_m/__init__.py b/src/transformers/models/deprecated/ernie_m/__init__.py
similarity index 95%
rename from src/transformers/models/ernie_m/__init__.py
rename to src/transformers/models/deprecated/ernie_m/__init__.py
index fc7076e4394552..68964d7574fc53 100644
--- a/src/transformers/models/ernie_m/__init__.py
+++ b/src/transformers/models/deprecated/ernie_m/__init__.py
@@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
# rely on isort to merge the imports
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/ernie_m/configuration_ernie_m.py b/src/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
similarity index 99%
rename from src/transformers/models/ernie_m/configuration_ernie_m.py
rename to src/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
index cf34a510f22679..d5c3feb951a317 100644
--- a/src/transformers/models/ernie_m/configuration_ernie_m.py
+++ b/src/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
@@ -19,7 +19,7 @@
from typing import Dict
-from ...configuration_utils import PretrainedConfig
+from ....configuration_utils import PretrainedConfig
class ErnieMConfig(PretrainedConfig):
diff --git a/src/transformers/models/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
similarity index 99%
rename from src/transformers/models/ernie_m/modeling_ernie_m.py
rename to src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
index 6b977801fe2bf4..d8349ee5aa4400 100755
--- a/src/transformers/models/ernie_m/modeling_ernie_m.py
+++ b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
@@ -22,8 +22,8 @@
from torch import nn, tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MultipleChoiceModelOutput,
@@ -31,9 +31,9 @@
SequenceClassifierOutput,
TokenClassifierOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ernie_m import ErnieMConfig
diff --git a/src/transformers/models/ernie_m/tokenization_ernie_m.py b/src/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
similarity index 99%
rename from src/transformers/models/ernie_m/tokenization_ernie_m.py
rename to src/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
index 0bd7edea1cab3a..07f9f4ed47384c 100644
--- a/src/transformers/models/ernie_m/tokenization_ernie_m.py
+++ b/src/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
@@ -21,8 +21,8 @@
import sentencepiece as spm
-from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/gptsan_japanese/__init__.py b/src/transformers/models/deprecated/gptsan_japanese/__init__.py
similarity index 98%
rename from src/transformers/models/gptsan_japanese/__init__.py
rename to src/transformers/models/deprecated/gptsan_japanese/__init__.py
index 9ae8af34667e34..5bd0f99840ca9c 100644
--- a/src/transformers/models/gptsan_japanese/__init__.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/__init__.py
@@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
-from ...utils import (
+from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
diff --git a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
similarity index 98%
rename from src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py
rename to src/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
index 23295f3510359c..52bd33ac9ff3d6 100644
--- a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""GPTSAN-japanese model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
similarity index 100%
rename from src/transformers/models/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
similarity index 99%
rename from src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
rename to src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
index 7faafd9efbbcdd..5129c1091ba3e2 100644
--- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
@@ -20,10 +20,10 @@
import torch
import torch.nn as nn
-from ...activations import ACT2FN
-from ...modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
similarity index 99%
rename from src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
rename to src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
index 56756f3c3282cc..e86aa47c1afece 100644
--- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
+++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -22,8 +22,8 @@
import numpy as np
-from ...tokenization_utils import PreTrainedTokenizer
-from ...tokenization_utils_base import (
+from ....tokenization_utils import PreTrainedTokenizer
+from ....tokenization_utils_base import (
BatchEncoding,
PreTokenizedInput,
PreTokenizedInputPair,
@@ -31,7 +31,7 @@
TextInputPair,
TruncationStrategy,
)
-from ...utils import PaddingStrategy, logging
+from ....utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/graphormer/__init__.py b/src/transformers/models/deprecated/graphormer/__init__.py
similarity index 93%
rename from src/transformers/models/graphormer/__init__.py
rename to src/transformers/models/deprecated/graphormer/__init__.py
index f8140c81c1bb9b..117bf7c15a8a9b 100644
--- a/src/transformers/models/graphormer/__init__.py
+++ b/src/transformers/models/deprecated/graphormer/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/graphormer/algos_graphormer.pyx b/src/transformers/models/deprecated/graphormer/algos_graphormer.pyx
similarity index 100%
rename from src/transformers/models/graphormer/algos_graphormer.pyx
rename to src/transformers/models/deprecated/graphormer/algos_graphormer.pyx
diff --git a/src/transformers/models/graphormer/collating_graphormer.py b/src/transformers/models/deprecated/graphormer/collating_graphormer.py
similarity index 98%
rename from src/transformers/models/graphormer/collating_graphormer.py
rename to src/transformers/models/deprecated/graphormer/collating_graphormer.py
index 58ce602ea28de1..1c2342913d63ff 100644
--- a/src/transformers/models/graphormer/collating_graphormer.py
+++ b/src/transformers/models/deprecated/graphormer/collating_graphormer.py
@@ -6,7 +6,7 @@
import numpy as np
import torch
-from ...utils import is_cython_available, requires_backends
+from ....utils import is_cython_available, requires_backends
if is_cython_available():
diff --git a/src/transformers/models/graphormer/configuration_graphormer.py b/src/transformers/models/deprecated/graphormer/configuration_graphormer.py
similarity index 99%
rename from src/transformers/models/graphormer/configuration_graphormer.py
rename to src/transformers/models/deprecated/graphormer/configuration_graphormer.py
index 9f6904ef387e96..058ef9d03a407e 100644
--- a/src/transformers/models/graphormer/configuration_graphormer.py
+++ b/src/transformers/models/deprecated/graphormer/configuration_graphormer.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""Graphormer model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
similarity index 99%
rename from src/transformers/models/graphormer/modeling_graphormer.py
rename to src/transformers/models/deprecated/graphormer/modeling_graphormer.py
index f2696a586ba410..0eb4aa71194c9e 100755
--- a/src/transformers/models/graphormer/modeling_graphormer.py
+++ b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
@@ -21,13 +21,13 @@
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithNoAttention,
SequenceClassifierOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ....modeling_utils import PreTrainedModel
+from ....utils import logging
from .configuration_graphormer import GraphormerConfig
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/deprecated/jukebox/__init__.py
similarity index 95%
rename from src/transformers/models/jukebox/__init__.py
rename to src/transformers/models/deprecated/jukebox/__init__.py
index 441b11329cf8ff..d6de90638905d3 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/deprecated/jukebox/__init__.py
@@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/deprecated/jukebox/configuration_jukebox.py
similarity index 99%
rename from src/transformers/models/jukebox/configuration_jukebox.py
rename to src/transformers/models/deprecated/jukebox/configuration_jukebox.py
index a2eee03885b9b1..e9d08c478f30f3 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/configuration_jukebox.py
@@ -17,8 +17,8 @@
import os
from typing import List, Union
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py
similarity index 100%
rename from src/transformers/models/jukebox/convert_jukebox.py
rename to src/transformers/models/deprecated/jukebox/convert_jukebox.py
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
similarity index 99%
rename from src/transformers/models/jukebox/modeling_jukebox.py
rename to src/transformers/models/deprecated/jukebox/modeling_jukebox.py
index 9af8dbd6847170..6688c79e71a20f 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -24,10 +24,10 @@
from torch import nn
from torch.nn import LayerNorm as FusedLayerNorm
-from ...activations import ACT2FN
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, logging
-from ...utils.logging import tqdm
+from ....activations import ACT2FN
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ....utils.logging import tqdm
from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/deprecated/jukebox/tokenization_jukebox.py
similarity index 98%
rename from src/transformers/models/jukebox/tokenization_jukebox.py
rename to src/transformers/models/deprecated/jukebox/tokenization_jukebox.py
index 4952adda64e690..fb827fbca9b48b 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/deprecated/jukebox/tokenization_jukebox.py
@@ -24,10 +24,10 @@
import numpy as np
import regex
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_base import BatchEncoding
-from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
-from ...utils.generic import _is_jax, _is_numpy
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
+from ....utils.generic import _is_jax, _is_numpy
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mega/__init__.py b/src/transformers/models/deprecated/mega/__init__.py
similarity index 98%
rename from src/transformers/models/mega/__init__.py
rename to src/transformers/models/deprecated/mega/__init__.py
index 3e3b204d8b1727..1774d3bae4eaab 100644
--- a/src/transformers/models/mega/__init__.py
+++ b/src/transformers/models/deprecated/mega/__init__.py
@@ -14,7 +14,7 @@
from typing import TYPE_CHECKING
-from ...utils import (
+from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
diff --git a/src/transformers/models/mega/configuration_mega.py b/src/transformers/models/deprecated/mega/configuration_mega.py
similarity index 99%
rename from src/transformers/models/mega/configuration_mega.py
rename to src/transformers/models/deprecated/mega/configuration_mega.py
index b090a020afb0ec..0b1ab53d5f65d9 100644
--- a/src/transformers/models/mega/configuration_mega.py
+++ b/src/transformers/models/deprecated/mega/configuration_mega.py
@@ -17,9 +17,9 @@
from collections import OrderedDict
from typing import Mapping
-from ...configuration_utils import PretrainedConfig
-from ...onnx import OnnxConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....onnx import OnnxConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
similarity index 100%
rename from src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
diff --git a/src/transformers/models/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py
similarity index 99%
rename from src/transformers/models/mega/modeling_mega.py
rename to src/transformers/models/deprecated/mega/modeling_mega.py
index 65fff1cd49735a..92d91bdb28bb2d 100644
--- a/src/transformers/models/mega/modeling_mega.py
+++ b/src/transformers/models/deprecated/mega/modeling_mega.py
@@ -23,8 +23,8 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
@@ -33,9 +33,9 @@
SequenceClassifierOutput,
TokenClassifierOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import ALL_LAYERNORM_LAYERS
+from ....utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/nat/__init__.py b/src/transformers/models/deprecated/nat/__init__.py
similarity index 94%
rename from src/transformers/models/nat/__init__.py
rename to src/transformers/models/deprecated/nat/__init__.py
index bcf05ddf41ed9b..70d2cfd2951a0d 100644
--- a/src/transformers/models/nat/__init__.py
+++ b/src/transformers/models/deprecated/nat/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_nat": ["NatConfig"]}
diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/deprecated/nat/configuration_nat.py
similarity index 97%
rename from src/transformers/models/nat/configuration_nat.py
rename to src/transformers/models/deprecated/nat/configuration_nat.py
index b20a60ac1ee1cd..2fef74d2a016bc 100644
--- a/src/transformers/models/nat/configuration_nat.py
+++ b/src/transformers/models/deprecated/nat/configuration_nat.py
@@ -14,9 +14,9 @@
# limitations under the License.
"""Neighborhood Attention Transformer model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/deprecated/nat/modeling_nat.py
similarity index 99%
rename from src/transformers/models/nat/modeling_nat.py
rename to src/transformers/models/deprecated/nat/modeling_nat.py
index fa518010096097..58d92ada0b1543 100644
--- a/src/transformers/models/nat/modeling_nat.py
+++ b/src/transformers/models/deprecated/nat/modeling_nat.py
@@ -23,11 +23,11 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import BackboneOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BackboneOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
@@ -38,7 +38,7 @@
replace_return_docstrings,
requires_backends,
)
-from ...utils.backbone_utils import BackboneMixin
+from ....utils.backbone_utils import BackboneMixin
from .configuration_nat import NatConfig
diff --git a/src/transformers/models/nezha/__init__.py b/src/transformers/models/deprecated/nezha/__init__.py
similarity index 94%
rename from src/transformers/models/nezha/__init__.py
rename to src/transformers/models/deprecated/nezha/__init__.py
index 5149adf3a0cdfa..590b0013c52d0d 100644
--- a/src/transformers/models/nezha/__init__.py
+++ b/src/transformers/models/deprecated/nezha/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/nezha/configuration_nezha.py b/src/transformers/models/deprecated/nezha/configuration_nezha.py
similarity index 99%
rename from src/transformers/models/nezha/configuration_nezha.py
rename to src/transformers/models/deprecated/nezha/configuration_nezha.py
index 4e145e4b687529..c60bb5de51f476 100644
--- a/src/transformers/models/nezha/configuration_nezha.py
+++ b/src/transformers/models/deprecated/nezha/configuration_nezha.py
@@ -1,4 +1,4 @@
-from ... import PretrainedConfig
+from .... import PretrainedConfig
class NezhaConfig(PretrainedConfig):
diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/deprecated/nezha/modeling_nezha.py
similarity index 99%
rename from src/transformers/models/nezha/modeling_nezha.py
rename to src/transformers/models/deprecated/nezha/modeling_nezha.py
index 30c8b6d890c372..ef20396c00810f 100644
--- a/src/transformers/models/nezha/modeling_nezha.py
+++ b/src/transformers/models/deprecated/nezha/modeling_nezha.py
@@ -25,8 +25,8 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
@@ -36,9 +36,9 @@
SequenceClassifierOutput,
TokenClassifierOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
diff --git a/src/transformers/models/qdqbert/__init__.py b/src/transformers/models/deprecated/qdqbert/__init__.py
similarity index 96%
rename from src/transformers/models/qdqbert/__init__.py
rename to src/transformers/models/deprecated/qdqbert/__init__.py
index d413aefe0c7c5a..06e69cdc1fd567 100644
--- a/src/transformers/models/qdqbert/__init__.py
+++ b/src/transformers/models/deprecated/qdqbert/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_qdqbert": ["QDQBertConfig"]}
diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
similarity index 98%
rename from src/transformers/models/qdqbert/configuration_qdqbert.py
rename to src/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
index 9f1fdfe31dbb1d..b2ba629b240727 100644
--- a/src/transformers/models/qdqbert/configuration_qdqbert.py
+++ b/src/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""QDQBERT model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
similarity index 99%
rename from src/transformers/models/qdqbert/modeling_qdqbert.py
rename to src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
index 60780618736cb1..f58c9b7fd65946 100755
--- a/src/transformers/models/qdqbert/modeling_qdqbert.py
+++ b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
@@ -25,8 +25,8 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
@@ -37,9 +37,9 @@
SequenceClassifierOutput,
TokenClassifierOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/deprecated/realm/__init__.py
similarity index 95%
rename from src/transformers/models/realm/__init__.py
rename to src/transformers/models/deprecated/realm/__init__.py
index eea7384673792a..85fe72441fd143 100644
--- a/src/transformers/models/realm/__init__.py
+++ b/src/transformers/models/deprecated/realm/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/deprecated/realm/configuration_realm.py
similarity index 98%
rename from src/transformers/models/realm/configuration_realm.py
rename to src/transformers/models/deprecated/realm/configuration_realm.py
index 7e84f2916de6a7..20fd201d98f121 100644
--- a/src/transformers/models/realm/configuration_realm.py
+++ b/src/transformers/models/deprecated/realm/configuration_realm.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""REALM model configuration."""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py
similarity index 99%
rename from src/transformers/models/realm/modeling_realm.py
rename to src/transformers/models/deprecated/realm/modeling_realm.py
index 7c5c344ae50f89..f41eafe1840585 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/deprecated/realm/modeling_realm.py
@@ -23,16 +23,16 @@
from torch import nn
from torch.nn import CrossEntropyLoss
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
ModelOutput,
)
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_realm import RealmConfig
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/deprecated/realm/retrieval_realm.py
similarity index 99%
rename from src/transformers/models/realm/retrieval_realm.py
rename to src/transformers/models/deprecated/realm/retrieval_realm.py
index c84e7af08f5601..4bfa2106c65ce1 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/deprecated/realm/retrieval_realm.py
@@ -20,8 +20,8 @@
import numpy as np
from huggingface_hub import hf_hub_download
-from ... import AutoTokenizer
-from ...utils import logging
+from .... import AutoTokenizer
+from ....utils import logging
_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/deprecated/realm/tokenization_realm.py
similarity index 99%
rename from src/transformers/models/realm/tokenization_realm.py
rename to src/transformers/models/deprecated/realm/tokenization_realm.py
index c4ff7e38a3e552..671405301dff18 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/deprecated/realm/tokenization_realm.py
@@ -19,9 +19,9 @@
import unicodedata
from typing import List, Optional, Tuple
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...tokenization_utils_base import BatchEncoding
-from ...utils import PaddingStrategy, logging
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/deprecated/realm/tokenization_realm_fast.py
similarity index 98%
rename from src/transformers/models/realm/tokenization_realm_fast.py
rename to src/transformers/models/deprecated/realm/tokenization_realm_fast.py
index 7315bf1c250182..cbc4869e549eba 100644
--- a/src/transformers/models/realm/tokenization_realm_fast.py
+++ b/src/transformers/models/deprecated/realm/tokenization_realm_fast.py
@@ -19,9 +19,9 @@
from tokenizers import normalizers
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, logging
+from ....tokenization_utils_base import BatchEncoding
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import PaddingStrategy, logging
from .tokenization_realm import RealmTokenizer
diff --git a/src/transformers/models/speech_to_text_2/__init__.py b/src/transformers/models/deprecated/speech_to_text_2/__init__.py
similarity index 98%
rename from src/transformers/models/speech_to_text_2/__init__.py
rename to src/transformers/models/deprecated/speech_to_text_2/__init__.py
index ab507bc19f85f9..53f806d00c6874 100644
--- a/src/transformers/models/speech_to_text_2/__init__.py
+++ b/src/transformers/models/deprecated/speech_to_text_2/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import (
+from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
similarity index 98%
rename from src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py
rename to src/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
index bcc92a7bd21f4a..d876c4fc3ecfdd 100644
--- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py
+++ b/src/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""Speech2Text model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
similarity index 99%
rename from src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
rename to src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
index 35305408e6ab3c..6953821648e9d4 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
@@ -22,11 +22,11 @@
from torch import nn
from torch.nn import CrossEntropyLoss
-from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
-from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, logging, replace_return_docstrings
+from ....activations import ACT2FN
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ....modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_speech_to_text_2 import Speech2Text2Config
diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
similarity index 98%
rename from src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
rename to src/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
index 9e0881d89d677a..ce8527e4a72edb 100644
--- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
+++ b/src/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
@@ -19,7 +19,7 @@
import warnings
from contextlib import contextmanager
-from ...processing_utils import ProcessorMixin
+from ....processing_utils import ProcessorMixin
class Speech2Text2Processor(ProcessorMixin):
diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
similarity index 98%
rename from src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
rename to src/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
index 8d6818356f3f2a..2eefe449151b7f 100644
--- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
+++ b/src/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -18,8 +18,8 @@
import os
from typing import Dict, List, Optional, Tuple
-from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/tvlt/__init__.py b/src/transformers/models/deprecated/tvlt/__init__.py
similarity index 99%
rename from src/transformers/models/tvlt/__init__.py
rename to src/transformers/models/deprecated/tvlt/__init__.py
index d63bad0a7adc81..0a2f1e39349433 100644
--- a/src/transformers/models/tvlt/__init__.py
+++ b/src/transformers/models/deprecated/tvlt/__init__.py
@@ -17,7 +17,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import (
+from ....utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
diff --git a/src/transformers/models/tvlt/configuration_tvlt.py b/src/transformers/models/deprecated/tvlt/configuration_tvlt.py
similarity index 99%
rename from src/transformers/models/tvlt/configuration_tvlt.py
rename to src/transformers/models/deprecated/tvlt/configuration_tvlt.py
index 1a1782f68c8995..bc9c133beca3dd 100644
--- a/src/transformers/models/tvlt/configuration_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/configuration_tvlt.py
@@ -14,8 +14,8 @@
# limitations under the License.
"""TVLT model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/tvlt/feature_extraction_tvlt.py b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
similarity index 98%
rename from src/transformers/models/tvlt/feature_extraction_tvlt.py
rename to src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
index 7dc5e0463138c5..2d41af33e548d3 100644
--- a/src/transformers/models/tvlt/feature_extraction_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
@@ -19,9 +19,9 @@
import numpy as np
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
-from ...utils import TensorType, logging
+from ....audio_utils import mel_filter_bank, spectrogram, window_function
+from ....feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
+from ....utils import TensorType, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/tvlt/image_processing_tvlt.py b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
similarity index 99%
rename from src/transformers/models/tvlt/image_processing_tvlt.py
rename to src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
index 06576a0f7ef4bd..009f8307d47577 100644
--- a/src/transformers/models/tvlt/image_processing_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/image_processing_tvlt.py
@@ -18,13 +18,13 @@
import numpy as np
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
-from ...image_transforms import (
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
-from ...image_utils import (
+from ....image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
@@ -38,7 +38,7 @@
validate_kwargs,
validate_preprocess_arguments,
)
-from ...utils import TensorType, logging
+from ....utils import TensorType, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/tvlt/modeling_tvlt.py b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py
similarity index 99%
rename from src/transformers/models/tvlt/modeling_tvlt.py
rename to src/transformers/models/deprecated/tvlt/modeling_tvlt.py
index d49fef582288d7..ae84a7df195e07 100644
--- a/src/transformers/models/tvlt/modeling_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py
@@ -25,11 +25,11 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, SequenceClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/tvlt/processing_tvlt.py b/src/transformers/models/deprecated/tvlt/processing_tvlt.py
similarity index 98%
rename from src/transformers/models/tvlt/processing_tvlt.py
rename to src/transformers/models/deprecated/tvlt/processing_tvlt.py
index c67a3a8c6d6df0..da9c755b55edc7 100644
--- a/src/transformers/models/tvlt/processing_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/processing_tvlt.py
@@ -16,7 +16,7 @@
Processor class for TVLT.
"""
-from ...processing_utils import ProcessorMixin
+from ....processing_utils import ProcessorMixin
class TvltProcessor(ProcessorMixin):
diff --git a/src/transformers/models/vit_hybrid/__init__.py b/src/transformers/models/deprecated/vit_hybrid/__init__.py
similarity index 94%
rename from src/transformers/models/vit_hybrid/__init__.py
rename to src/transformers/models/deprecated/vit_hybrid/__init__.py
index f87e44449a978e..d0f9c5831d8445 100644
--- a/src/transformers/models/vit_hybrid/__init__.py
+++ b/src/transformers/models/deprecated/vit_hybrid/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_vit_hybrid": ["ViTHybridConfig"]}
diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
similarity index 97%
rename from src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
rename to src/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
index 78349af336ef9b..c0e4244a5a2b44 100644
--- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
+++ b/src/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
@@ -14,10 +14,10 @@
# limitations under the License.
"""ViT Hybrid model configuration"""
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ..auto.configuration_auto import CONFIG_MAPPING
-from ..bit import BitConfig
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto.configuration_auto import CONFIG_MAPPING
+from ...bit import BitConfig
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py
similarity index 100%
rename from src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py
rename to src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py
diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
similarity index 98%
rename from src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py
rename to src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
index 4bb3f70b49bb5b..b8db4a7faee144 100644
--- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py
+++ b/src/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
@@ -18,14 +18,14 @@
import numpy as np
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
-from ...image_transforms import (
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
-from ...image_utils import (
+from ....image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
@@ -39,7 +39,7 @@
validate_kwargs,
validate_preprocess_arguments,
)
-from ...utils import TensorType, is_vision_available, logging
+from ....utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
similarity index 98%
rename from src/transformers/models/vit_hybrid/modeling_vit_hybrid.py
rename to src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
index 9c92e592c1832b..9c025d36153982 100644
--- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py
+++ b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
@@ -23,12 +23,12 @@
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from ...utils.backbone_utils import load_backbone
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....utils.backbone_utils import load_backbone
from .configuration_vit_hybrid import ViTHybridConfig
diff --git a/src/transformers/models/xlm_prophetnet/__init__.py b/src/transformers/models/deprecated/xlm_prophetnet/__init__.py
similarity index 95%
rename from src/transformers/models/xlm_prophetnet/__init__.py
rename to src/transformers/models/deprecated/xlm_prophetnet/__init__.py
index d9c24d9b4d2513..850d2958cb49ec 100644
--- a/src/transformers/models/xlm_prophetnet/__init__.py
+++ b/src/transformers/models/deprecated/xlm_prophetnet/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
_import_structure = {
diff --git a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
similarity index 99%
rename from src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
rename to src/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
index 94d38242b64969..5d3f63670f0cc6 100644
--- a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
+++ b/src/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
@@ -16,8 +16,8 @@
from typing import Callable, Optional, Union
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
similarity index 99%
rename from src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
rename to src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
index 669c21026e66de..68fb70d4f1a640 100644
--- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
+++ b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
@@ -25,10 +25,10 @@
from torch import Tensor, nn
from torch.nn import LayerNorm
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
similarity index 99%
rename from src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
rename to src/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
index fa65fa5cbfbaf2..87f458001988cb 100644
--- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+++ b/src/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -18,8 +18,8 @@
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
-from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
logger = logging.get_logger(__name__)
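
Every package relocated to src/transformers/models/deprecated/ above sits one directory deeper than before, so each rename is paired with a purely mechanical import fix: every relative import gains exactly one leading dot (from ...utils becomes from ....utils, and ..auto / ..bit become ...auto / ...bit in the ViT Hybrid config). A minimal sketch of that rewrite rule, using a hypothetical helper that is not part of this patch or of the transformers tooling:

import re

def bump_relative_imports(source: str) -> str:
    """Add one leading dot to every relative import, mirroring the extra
    'deprecated/' directory level introduced by the renames above."""
    return re.sub(r"^(\s*from\s+)(\.+)", lambda m: m.group(1) + "." + m.group(2), source, flags=re.M)

print(bump_relative_imports("from ...utils import logging"))
# -> from ....utils import logging
print(bump_relative_imports("from ..auto.configuration_auto import CONFIG_MAPPING"))
# -> from ...auto.configuration_auto import CONFIG_MAPPING
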
diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py
index 95d6a60183dc45..18f8725da86133 100644
--- a/src/transformers/models/dinat/modeling_dinat.py
+++ b/src/transformers/models/dinat/modeling_dinat.py
@@ -71,7 +71,6 @@ def natten2dav(*args, **kwargs):
@dataclass
-# Copied from transformers.models.nat.modeling_nat.NatEncoderOutput with Nat->Dinat
class DinatEncoderOutput(ModelOutput):
"""
Dinat encoder's outputs, with potential hidden states and attentions.
@@ -105,7 +104,6 @@ class DinatEncoderOutput(ModelOutput):
@dataclass
-# Copied from transformers.models.nat.modeling_nat.NatModelOutput with Nat->Dinat
class DinatModelOutput(ModelOutput):
"""
Dinat model's outputs that also contains a pooling of the last hidden states.
@@ -142,7 +140,6 @@ class DinatModelOutput(ModelOutput):
@dataclass
-# Copied from transformers.models.nat.modeling_nat.NatImageClassifierOutput with Nat->Dinat
class DinatImageClassifierOutput(ModelOutput):
"""
Dinat outputs for image classification.
@@ -178,7 +175,6 @@ class DinatImageClassifierOutput(ModelOutput):
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
-# Copied from transformers.models.nat.modeling_nat.NatEmbeddings with Nat->Dinat
class DinatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
@@ -201,7 +197,6 @@ def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tens
return embeddings
-# Copied from transformers.models.nat.modeling_nat.NatPatchEmbeddings with Nat->Dinat
class DinatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
@@ -238,7 +233,6 @@ def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
return embeddings
-# Copied from transformers.models.nat.modeling_nat.NatDownsampler with Nat->Dinat
class DinatDownsampler(nn.Module):
"""
Convolutional Downsampling Layer.
@@ -321,7 +315,6 @@ def __init__(self, config, dim, num_heads, kernel_size, dilation):
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
- # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttention.transpose_for_scores with Nat->Dinat
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
@@ -361,7 +354,6 @@ def forward(
return outputs
-# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionOutput
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
@@ -382,7 +374,6 @@ def __init__(self, config, dim, num_heads, kernel_size, dilation):
self.output = NeighborhoodAttentionOutput(config, dim)
self.pruned_heads = set()
- # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.prune_heads
def prune_heads(self, heads):
if len(heads) == 0:
return
@@ -401,7 +392,6 @@ def prune_heads(self, heads):
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
- # Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.forward
def forward(
self,
hidden_states: torch.Tensor,
@@ -413,7 +403,6 @@ def forward(
return outputs
-# Copied from transformers.models.nat.modeling_nat.NatIntermediate with Nat->Dinat
class DinatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
@@ -429,7 +418,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return hidden_states
-# Copied from transformers.models.nat.modeling_nat.NatOutput with Nat->Dinat
class DinatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
@@ -539,7 +527,6 @@ def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, dow
self.pointing = False
- # Copied from transformers.models.nat.modeling_nat.NatStage.forward
def forward(
self,
hidden_states: torch.Tensor,
@@ -582,7 +569,6 @@ def __init__(self, config):
]
)
- # Copied from transformers.models.nat.modeling_nat.NatEncoder.forward with Nat->Dinat
def forward(
self,
hidden_states: torch.Tensor,
@@ -687,7 +673,6 @@ def _init_weights(self, module):
"The bare Dinat Model transformer outputting raw hidden-states without any specific head on top.",
DINAT_START_DOCSTRING,
)
-# Copied from transformers.models.nat.modeling_nat.NatModel with Nat->Dinat, NAT->DINAT
class DinatModel(DinatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
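
The Dinat hunks above only delete "# Copied from transformers.models.nat..." markers: with Nat now living under models/deprecated/nat, a maintained model such as Dinat presumably should no longer declare a deprecated module as its copy source, and the old dotted paths no longer match the file layout anyway. A rough, illustrative check for such stale markers (this is not the repository's actual copy-consistency tooling, and it only resolves module-level targets):

import re
from pathlib import Path

MARKER = re.compile(r"# Copied from (transformers\.models\.[\w.]+)")

def stale_copy_markers(py_file: str, repo_src: str = "src") -> list[str]:
    """Return marker targets whose module file no longer exists at the referenced path."""
    stale = []
    for dotted in MARKER.findall(Path(py_file).read_text()):
        module = dotted.rsplit(".", 1)[0]                      # drop the object/method name
        candidate = Path(repo_src, *module.split(".")).with_suffix(".py")
        if not candidate.exists():
            stale.append(dotted)
    return stale

# After this patch, "transformers.models.nat.modeling_nat.NatModel" points at
# src/transformers/models/nat/modeling_nat.py, which no longer exists; the file
# now lives at src/transformers/models/deprecated/nat/modeling_nat.py.
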
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 5ac2a2ccbd5973..0cda4ed7b96349 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2609,6 +2609,174 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class DetaForObjectDetection(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class DetaModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class DetaPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class EfficientFormerForImageClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class EfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class EfficientFormerModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class EfficientFormerPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMForInformationExtraction(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMForMultipleChoice(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMForQuestionAnswering(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMForSequenceClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMForTokenClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ErnieMPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GPTSanJapaneseForConditionalGeneration(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GPTSanJapaneseModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GPTSanJapanesePreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GraphormerForGraphClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GraphormerModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class GraphormerPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class JukeboxModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class JukeboxPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class JukeboxPrior(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class JukeboxVQVAE(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class MCTCTForCTC(metaclass=DummyObject):
_backends = ["torch"]
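
The dummy_pt_objects.py hunks relocate the dummies for the newly deprecated models: they are added here and deleted from their old alphabetical positions further down, so most of the apparent churn is reordering rather than new classes. The pattern of each entry is unchanged: a class with metaclass=DummyObject and _backends = ["torch"] whose __init__ immediately calls requires_backends, so the name stays importable without PyTorch and fails with an informative error only on use. A small self-contained sketch of that pattern, with simplified stand-ins for the DummyObject metaclass and the requires_backends helper (illustrative only; transformers defines its own versions, and these dummy modules are only loaded when torch is unavailable):

class DummyObject(type):
    """Metaclass: touching any public attribute of the dummy class raises immediately."""
    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)

def requires_backends(obj, backends):
    # The real helper checks backend availability first; here we always raise,
    # since a dummy only exists because the backend is missing.
    name = getattr(obj, "__name__", obj.__class__.__name__)
    raise ImportError(f"{name} requires the following backends: {', '.join(backends)}")

class DetaModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

try:
    DetaModel()                      # instantiation (or any attribute access) fails cleanly
except ImportError as err:
    print(err)                       # DetaModel requires the following backends: torch
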
@@ -2616,69 +2784,343 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MCTCTModel(metaclass=DummyObject):
+class MCTCTModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MCTCTPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForCausalLM(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForMaskedLM(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForMultipleChoice(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForQuestionAnswering(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForSequenceClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaForTokenClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MegaPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MMBTForClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MMBTModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class ModalEmbeddings(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NatBackbone(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NatForImageClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NatModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NatPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForMaskedLM(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForMultipleChoice(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForNextSentencePrediction(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForPreTraining(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForQuestionAnswering(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForSequenceClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaForTokenClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class NezhaPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class OpenLlamaForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MCTCTPreTrainedModel(metaclass=DummyObject):
+class OpenLlamaForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MMBTForClassification(metaclass=DummyObject):
+class OpenLlamaModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MMBTModel(metaclass=DummyObject):
+class OpenLlamaPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ModalEmbeddings(metaclass=DummyObject):
+class QDQBertForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class OpenLlamaForCausalLM(metaclass=DummyObject):
+class QDQBertForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class OpenLlamaForSequenceClassification(metaclass=DummyObject):
+class QDQBertForNextSentencePrediction(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class OpenLlamaModel(metaclass=DummyObject):
+class QDQBertForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class OpenLlamaPreTrainedModel(metaclass=DummyObject):
+class QDQBertForSequenceClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class QDQBertForTokenClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class QDQBertLayer(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class QDQBertLMHeadModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class QDQBertModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class QDQBertPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+def load_tf_weights_in_qdqbert(*args, **kwargs):
+ requires_backends(load_tf_weights_in_qdqbert, ["torch"])
+
+
+class RealmEmbedder(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmForOpenQA(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmKnowledgeAugEncoder(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmReader(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmRetriever(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class RealmScorer(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+def load_tf_weights_in_realm(*args, **kwargs):
+ requires_backends(load_tf_weights_in_realm, ["torch"])
+
+
class RetriBertModel(metaclass=DummyObject):
_backends = ["torch"]
@@ -2693,6 +3135,20 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class Speech2Text2ForCausalLM(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Speech2Text2PreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class TrajectoryTransformerModel(metaclass=DummyObject):
_backends = ["torch"]
@@ -2746,536 +3202,529 @@ def load_tf_weights_in_transfo_xl(*args, **kwargs):
requires_backends(load_tf_weights_in_transfo_xl, ["torch"])
-class VanForImageClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class VanModel(metaclass=DummyObject):
+class TvltForAudioVisualClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class VanPreTrainedModel(metaclass=DummyObject):
+class TvltForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DepthAnythingForDepthEstimation(metaclass=DummyObject):
+class TvltModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DepthAnythingPreTrainedModel(metaclass=DummyObject):
+class TvltPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetaForObjectDetection(metaclass=DummyObject):
+class VanForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetaModel(metaclass=DummyObject):
+class VanModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetaPreTrainedModel(metaclass=DummyObject):
+class VanPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetrForObjectDetection(metaclass=DummyObject):
+class ViTHybridForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetrForSegmentation(metaclass=DummyObject):
+class ViTHybridModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetrModel(metaclass=DummyObject):
+class ViTHybridPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DetrPreTrainedModel(metaclass=DummyObject):
+class XLMProphetNetDecoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DinatBackbone(metaclass=DummyObject):
+class XLMProphetNetEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DinatForImageClassification(metaclass=DummyObject):
+class XLMProphetNetForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DinatModel(metaclass=DummyObject):
+class XLMProphetNetForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DinatPreTrainedModel(metaclass=DummyObject):
+class XLMProphetNetModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Dinov2Backbone(metaclass=DummyObject):
+class XLMProphetNetPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Dinov2ForImageClassification(metaclass=DummyObject):
+class DepthAnythingForDepthEstimation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Dinov2Model(metaclass=DummyObject):
+class DepthAnythingPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Dinov2PreTrainedModel(metaclass=DummyObject):
+class DetrForObjectDetection(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertForMaskedLM(metaclass=DummyObject):
+class DetrForSegmentation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertForMultipleChoice(metaclass=DummyObject):
+class DetrModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertForQuestionAnswering(metaclass=DummyObject):
+class DetrPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertForSequenceClassification(metaclass=DummyObject):
+class DinatBackbone(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertForTokenClassification(metaclass=DummyObject):
+class DinatForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertModel(metaclass=DummyObject):
+class DinatModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DistilBertPreTrainedModel(metaclass=DummyObject):
+class DinatPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DonutSwinModel(metaclass=DummyObject):
+class Dinov2Backbone(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DonutSwinPreTrainedModel(metaclass=DummyObject):
+class Dinov2ForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRContextEncoder(metaclass=DummyObject):
+class Dinov2Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRPretrainedContextEncoder(metaclass=DummyObject):
+class Dinov2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRPreTrainedModel(metaclass=DummyObject):
+class DistilBertForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRPretrainedQuestionEncoder(metaclass=DummyObject):
+class DistilBertForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRPretrainedReader(metaclass=DummyObject):
+class DistilBertForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRQuestionEncoder(metaclass=DummyObject):
+class DistilBertForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPRReader(metaclass=DummyObject):
+class DistilBertForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPTForDepthEstimation(metaclass=DummyObject):
+class DistilBertModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPTForSemanticSegmentation(metaclass=DummyObject):
+class DistilBertPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPTModel(metaclass=DummyObject):
+class DonutSwinModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class DPTPreTrainedModel(metaclass=DummyObject):
+class DonutSwinPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientFormerForImageClassification(metaclass=DummyObject):
+class DPRContextEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject):
+class DPRPretrainedContextEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientFormerModel(metaclass=DummyObject):
+class DPRPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientFormerPreTrainedModel(metaclass=DummyObject):
+class DPRPretrainedQuestionEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientNetForImageClassification(metaclass=DummyObject):
+class DPRPretrainedReader(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientNetModel(metaclass=DummyObject):
+class DPRQuestionEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EfficientNetPreTrainedModel(metaclass=DummyObject):
+class DPRReader(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForCausalLM(metaclass=DummyObject):
+class DPTForDepthEstimation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForMaskedLM(metaclass=DummyObject):
+class DPTForSemanticSegmentation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForMultipleChoice(metaclass=DummyObject):
+class DPTModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForPreTraining(metaclass=DummyObject):
+class DPTPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForQuestionAnswering(metaclass=DummyObject):
+class EfficientNetForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForSequenceClassification(metaclass=DummyObject):
+class EfficientNetModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraForTokenClassification(metaclass=DummyObject):
+class EfficientNetPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraModel(metaclass=DummyObject):
+class ElectraForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ElectraPreTrainedModel(metaclass=DummyObject):
+class ElectraForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-def load_tf_weights_in_electra(*args, **kwargs):
- requires_backends(load_tf_weights_in_electra, ["torch"])
-
-
-class EncodecModel(metaclass=DummyObject):
+class ElectraForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EncodecPreTrainedModel(metaclass=DummyObject):
+class ElectraForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class EncoderDecoderModel(metaclass=DummyObject):
+class ElectraForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForCausalLM(metaclass=DummyObject):
+class ElectraForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForMaskedLM(metaclass=DummyObject):
+class ElectraForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForMultipleChoice(metaclass=DummyObject):
+class ElectraModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForNextSentencePrediction(metaclass=DummyObject):
+class ElectraPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForPreTraining(metaclass=DummyObject):
+def load_tf_weights_in_electra(*args, **kwargs):
+ requires_backends(load_tf_weights_in_electra, ["torch"])
+
+
+class EncodecModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForQuestionAnswering(metaclass=DummyObject):
+class EncodecPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForSequenceClassification(metaclass=DummyObject):
+class EncoderDecoderModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieForTokenClassification(metaclass=DummyObject):
+class ErnieForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieModel(metaclass=DummyObject):
+class ErnieForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErniePreTrainedModel(metaclass=DummyObject):
+class ErnieForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMForInformationExtraction(metaclass=DummyObject):
+class ErnieForNextSentencePrediction(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMForMultipleChoice(metaclass=DummyObject):
+class ErnieForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMForQuestionAnswering(metaclass=DummyObject):
+class ErnieForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMForSequenceClassification(metaclass=DummyObject):
+class ErnieForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMForTokenClassification(metaclass=DummyObject):
+class ErnieForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMModel(metaclass=DummyObject):
+class ErnieModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ErnieMPreTrainedModel(metaclass=DummyObject):
+class ErniePreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
@@ -4043,48 +4492,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class GPTSanJapaneseForConditionalGeneration(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class GPTSanJapaneseModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class GPTSanJapanesePreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class GraphormerForGraphClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class GraphormerModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class GraphormerPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class GroundingDinoForObjectDetection(metaclass=DummyObject):
_backends = ["torch"]
@@ -4404,34 +4811,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class JukeboxModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class JukeboxPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class JukeboxPrior(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class JukeboxVQVAE(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class Kosmos2ForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
@@ -5125,62 +5504,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MegaForCausalLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaForMaskedLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaForMultipleChoice(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaForQuestionAnswering(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaForSequenceClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaForTokenClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MegaPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class MegatronBertForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
@@ -5746,175 +6069,84 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class MusicgenPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MusicgenProcessor(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MusicgenMelodyForCausalLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MusicgenMelodyForConditionalGeneration(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MusicgenMelodyModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MusicgenMelodyPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpForCausalLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpForConditionalGeneration(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpForQuestionAnswering(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpForSequenceClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class MvpPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class NatBackbone(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class NatForImageClassification(metaclass=DummyObject):
+class MusicgenPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NatModel(metaclass=DummyObject):
+class MusicgenProcessor(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NatPreTrainedModel(metaclass=DummyObject):
+class MusicgenMelodyForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForMaskedLM(metaclass=DummyObject):
+class MusicgenMelodyForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForMultipleChoice(metaclass=DummyObject):
+class MusicgenMelodyModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForNextSentencePrediction(metaclass=DummyObject):
+class MusicgenMelodyPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForPreTraining(metaclass=DummyObject):
+class MvpForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForQuestionAnswering(metaclass=DummyObject):
+class MvpForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForSequenceClassification(metaclass=DummyObject):
+class MvpForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaForTokenClassification(metaclass=DummyObject):
+class MvpForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaModel(metaclass=DummyObject):
+class MvpModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class NezhaPreTrainedModel(metaclass=DummyObject):
+class MvpPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
@@ -6716,80 +6948,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class QDQBertForMaskedLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertForMultipleChoice(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertForNextSentencePrediction(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertForQuestionAnswering(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertForSequenceClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertForTokenClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertLayer(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertLMHeadModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class QDQBertPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-def load_tf_weights_in_qdqbert(*args, **kwargs):
- requires_backends(load_tf_weights_in_qdqbert, ["torch"])
-
-
class Qwen2ForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
@@ -6888,59 +7046,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class RealmEmbedder(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmForOpenQA(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmKnowledgeAugEncoder(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmReader(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmRetriever(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class RealmScorer(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-def load_tf_weights_in_realm(*args, **kwargs):
- requires_backends(load_tf_weights_in_realm, ["torch"])
-
-
class RecurrentGemmaForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
@@ -7716,20 +7821,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class Speech2Text2ForCausalLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class Speech2Text2PreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class SpeechT5ForSpeechToSpeech(metaclass=DummyObject):
_backends = ["torch"]
@@ -8277,34 +8368,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class TvltForAudioVisualClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class TvltForPreTraining(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class TvltModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class TvltPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class TvpForVideoGrounding(metaclass=DummyObject):
_backends = ["torch"]
@@ -8725,27 +8788,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class ViTHybridForImageClassification(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class ViTHybridModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class ViTHybridPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class ViTMAEForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
@@ -9194,48 +9236,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
-class XLMProphetNetDecoder(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class XLMProphetNetEncoder(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class XLMProphetNetForCausalLM(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class XLMProphetNetForConditionalGeneration(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class XLMProphetNetModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
-class XLMProphetNetPreTrainedModel(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
class XLMRobertaForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 33ee907a741f18..8977b4f51b6308 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -72,6 +72,13 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
+class XLMProphetNetTokenizer(metaclass=DummyObject):
+ _backends = ["sentencepiece"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["sentencepiece"])
+
+
class FNetTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
@@ -233,13 +240,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
-class XLMProphetNetTokenizer(metaclass=DummyObject):
- _backends = ["sentencepiece"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["sentencepiece"])
-
-
class XLMRobertaTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 337b0938b3741a..942a7afced4bc3 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1038,168 +1038,168 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFAdaptiveEmbedding(metaclass=DummyObject):
+class TFEfficientFormerForImageClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFTransfoXLForSequenceClassification(metaclass=DummyObject):
+class TFEfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFTransfoXLLMHeadModel(metaclass=DummyObject):
+class TFEfficientFormerModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFTransfoXLMainLayer(metaclass=DummyObject):
+class TFEfficientFormerPreTrainedModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFTransfoXLModel(metaclass=DummyObject):
+class TFAdaptiveEmbedding(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFTransfoXLPreTrainedModel(metaclass=DummyObject):
+class TFTransfoXLForSequenceClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertForMaskedLM(metaclass=DummyObject):
+class TFTransfoXLLMHeadModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertForMultipleChoice(metaclass=DummyObject):
+class TFTransfoXLMainLayer(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertForQuestionAnswering(metaclass=DummyObject):
+class TFTransfoXLModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertForSequenceClassification(metaclass=DummyObject):
+class TFTransfoXLPreTrainedModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertForTokenClassification(metaclass=DummyObject):
+class TFDistilBertForMaskedLM(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertMainLayer(metaclass=DummyObject):
+class TFDistilBertForMultipleChoice(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertModel(metaclass=DummyObject):
+class TFDistilBertForQuestionAnswering(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDistilBertPreTrainedModel(metaclass=DummyObject):
+class TFDistilBertForSequenceClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRContextEncoder(metaclass=DummyObject):
+class TFDistilBertForTokenClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRPretrainedContextEncoder(metaclass=DummyObject):
+class TFDistilBertMainLayer(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRPretrainedQuestionEncoder(metaclass=DummyObject):
+class TFDistilBertModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRPretrainedReader(metaclass=DummyObject):
+class TFDistilBertPreTrainedModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRQuestionEncoder(metaclass=DummyObject):
+class TFDPRContextEncoder(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFDPRReader(metaclass=DummyObject):
+class TFDPRPretrainedContextEncoder(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFEfficientFormerForImageClassification(metaclass=DummyObject):
+class TFDPRPretrainedQuestionEncoder(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFEfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject):
+class TFDPRPretrainedReader(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFEfficientFormerModel(metaclass=DummyObject):
+class TFDPRQuestionEncoder(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
-class TFEfficientFormerPreTrainedModel(metaclass=DummyObject):
+class TFDPRReader(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
index 0b7ddf119d79a1..df83e6fa6478e6 100644
--- a/src/transformers/utils/dummy_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -121,6 +121,13 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
+class RealmTokenizerFast(metaclass=DummyObject):
+ _backends = ["tokenizers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tokenizers"])
+
+
class RetriBertTokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
@@ -352,13 +359,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
-class RealmTokenizerFast(metaclass=DummyObject):
- _backends = ["tokenizers"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["tokenizers"])
-
-
class ReformerTokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index d32778d4b5f681..aae31e9e4dd7f4 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -142,49 +142,63 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DetrFeatureExtractor(metaclass=DummyObject):
+class EfficientFormerImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DetrImageProcessor(metaclass=DummyObject):
+class TvltImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DonutFeatureExtractor(metaclass=DummyObject):
+class ViTHybridImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DonutImageProcessor(metaclass=DummyObject):
+class DetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DPTFeatureExtractor(metaclass=DummyObject):
+class DetrImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class DPTImageProcessor(metaclass=DummyObject):
+class DonutFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class EfficientFormerImageProcessor(metaclass=DummyObject):
+class DonutImageProcessor(metaclass=DummyObject):
+ _backends = ["vision"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["vision"])
+
+
+class DPTFeatureExtractor(metaclass=DummyObject):
+ _backends = ["vision"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["vision"])
+
+
+class DPTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
@@ -520,13 +534,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class TvltImageProcessor(metaclass=DummyObject):
- _backends = ["vision"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["vision"])
-
-
class TvpImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
@@ -590,13 +597,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
-class ViTHybridImageProcessor(metaclass=DummyObject):
- _backends = ["vision"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["vision"])
-
-
class VitMatteImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
diff --git a/tests/models/deta/__init__.py b/tests/models/deta/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py
deleted file mode 100644
index 3ea5885b0e0921..00000000000000
--- a/tests/models/deta/test_image_processing_deta.py
+++ /dev/null
@@ -1,535 +0,0 @@
-# coding=utf-8
-# Copyright 2022 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import pathlib
-import unittest
-
-from transformers.testing_utils import require_torch, require_vision, slow
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_torch_available():
- import torch
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import DetaImageProcessor
-
-
-class DetaImageProcessingTester(unittest.TestCase):
- def __init__(
- self,
- parent,
- batch_size=7,
- num_channels=3,
- min_resolution=30,
- max_resolution=400,
- do_resize=True,
- size=None,
- do_normalize=True,
- image_mean=[0.5, 0.5, 0.5],
- image_std=[0.5, 0.5, 0.5],
- do_rescale=True,
- rescale_factor=1 / 255,
- do_pad=True,
- ):
- # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
- size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
- self.parent = parent
- self.batch_size = batch_size
- self.num_channels = num_channels
- self.min_resolution = min_resolution
- self.max_resolution = max_resolution
- self.do_resize = do_resize
- self.size = size
- self.do_normalize = do_normalize
- self.image_mean = image_mean
- self.image_std = image_std
- self.do_rescale = do_rescale
- self.rescale_factor = rescale_factor
- self.do_pad = do_pad
-
- def prepare_image_processor_dict(self):
- return {
- "do_resize": self.do_resize,
- "size": self.size,
- "do_normalize": self.do_normalize,
- "image_mean": self.image_mean,
- "image_std": self.image_std,
- "do_rescale": self.do_rescale,
- "rescale_factor": self.rescale_factor,
- "do_pad": self.do_pad,
- }
-
- def get_expected_values(self, image_inputs, batched=False):
- """
- This function computes the expected height and width when providing images to DetaImageProcessor,
- assuming do_resize is set to True with a scalar size.
- """
- if not batched:
- image = image_inputs[0]
- if isinstance(image, Image.Image):
- w, h = image.size
- else:
- h, w = image.shape[1], image.shape[2]
- if w < h:
- expected_height = int(self.size["shortest_edge"] * h / w)
- expected_width = self.size["shortest_edge"]
- elif w > h:
- expected_height = self.size["shortest_edge"]
- expected_width = int(self.size["shortest_edge"] * w / h)
- else:
- expected_height = self.size["shortest_edge"]
- expected_width = self.size["shortest_edge"]
-
- else:
- expected_values = []
- for image in image_inputs:
- expected_height, expected_width = self.get_expected_values([image])
- expected_values.append((expected_height, expected_width))
- expected_height = max(expected_values, key=lambda item: item[0])[0]
- expected_width = max(expected_values, key=lambda item: item[1])[1]
-
- return expected_height, expected_width
-
- def expected_output_image_shape(self, images):
- height, width = self.get_expected_values(images, batched=True)
- return self.num_channels, height, width
-
- def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
- return prepare_image_inputs(
- batch_size=self.batch_size,
- num_channels=self.num_channels,
- min_resolution=self.min_resolution,
- max_resolution=self.max_resolution,
- equal_resolution=equal_resolution,
- numpify=numpify,
- torchify=torchify,
- )
-
-
-@require_torch
-@require_vision
-class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
- image_processing_class = DetaImageProcessor if is_vision_available() else None
-
- def setUp(self):
- self.image_processor_tester = DetaImageProcessingTester(self)
-
- @property
- def image_processor_dict(self):
- return self.image_processor_tester.prepare_image_processor_dict()
-
- def test_image_processor_properties(self):
- image_processing = self.image_processing_class(**self.image_processor_dict)
- self.assertTrue(hasattr(image_processing, "image_mean"))
- self.assertTrue(hasattr(image_processing, "image_std"))
- self.assertTrue(hasattr(image_processing, "do_normalize"))
- self.assertTrue(hasattr(image_processing, "do_resize"))
- self.assertTrue(hasattr(image_processing, "do_rescale"))
- self.assertTrue(hasattr(image_processing, "do_pad"))
- self.assertTrue(hasattr(image_processing, "size"))
-
- def test_image_processor_from_dict_with_kwargs(self):
- image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
- self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
- self.assertEqual(image_processor.do_pad, True)
-
- @slow
- def test_call_pytorch_with_coco_detection_annotations(self):
- # prepare image and target
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
- target = json.loads(f.read())
-
- target = {"image_id": 39769, "annotations": target}
-
- # encode them
- image_processing = DetaImageProcessor()
- encoding = image_processing(images=image, annotations=target, return_tensors="pt")
-
- # verify pixel values
- expected_shape = torch.Size([1, 3, 800, 1066])
- self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
- expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
- self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
-
- # verify area
- expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
- self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
- # verify boxes
- expected_boxes_shape = torch.Size([6, 4])
- self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
- expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
- # verify image_id
- expected_image_id = torch.tensor([39769])
- self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
- # verify is_crowd
- expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
- self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
- # verify class_labels
- expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
- self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
- # verify orig_size
- expected_orig_size = torch.tensor([480, 640])
- self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
- # verify size
- expected_size = torch.tensor([800, 1066])
- self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
-
- @slow
- def test_call_pytorch_with_coco_panoptic_annotations(self):
- # prepare image, target and masks_path
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
- target = json.loads(f.read())
-
- target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
-
- masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
-
- # encode them
- image_processing = DetaImageProcessor(format="coco_panoptic")
- encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
-
- # verify pixel values
- expected_shape = torch.Size([1, 3, 800, 1066])
- self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
- expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
- self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
-
- # verify area
- expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
- self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
- # verify boxes
- expected_boxes_shape = torch.Size([6, 4])
- self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
- expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
- # verify image_id
- expected_image_id = torch.tensor([39769])
- self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
- # verify is_crowd
- expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
- self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
- # verify class_labels
- expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
- self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
- # verify masks
- expected_masks_sum = 822873
- self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
- # verify orig_size
- expected_orig_size = torch.tensor([480, 640])
- self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
- # verify size
- expected_size = torch.tensor([800, 1066])
- self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
-
- @slow
- # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
- def test_batched_coco_detection_annotations(self):
- image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
-
- with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
- target = json.loads(f.read())
-
- annotations_0 = {"image_id": 39769, "annotations": target}
- annotations_1 = {"image_id": 39769, "annotations": target}
-
- # Adjust the bounding boxes for the resized image
- w_0, h_0 = image_0.size
- w_1, h_1 = image_1.size
- for i in range(len(annotations_1["annotations"])):
- coords = annotations_1["annotations"][i]["bbox"]
- new_bbox = [
- coords[0] * w_1 / w_0,
- coords[1] * h_1 / h_0,
- coords[2] * w_1 / w_0,
- coords[3] * h_1 / h_0,
- ]
- annotations_1["annotations"][i]["bbox"] = new_bbox
-
- images = [image_0, image_1]
- annotations = [annotations_0, annotations_1]
-
- image_processing = DetaImageProcessor()
- encoding = image_processing(
- images=images,
- annotations=annotations,
- return_segmentation_masks=True,
- return_tensors="pt", # do_convert_annotations=True
- )
-
- # Check the pixel values have been padded
- postprocessed_height, postprocessed_width = 800, 1066
- expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
- self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
- # Check the bounding boxes have been adjusted for padded images
- self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
- self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
- expected_boxes_0 = torch.tensor(
- [
- [0.6879, 0.4609, 0.0755, 0.3691],
- [0.2118, 0.3359, 0.2601, 0.1566],
- [0.5011, 0.5000, 0.9979, 1.0000],
- [0.5010, 0.5020, 0.9979, 0.9959],
- [0.3284, 0.5944, 0.5884, 0.8112],
- [0.8394, 0.5445, 0.3213, 0.9110],
- ]
- )
- expected_boxes_1 = torch.tensor(
- [
- [0.4130, 0.2765, 0.0453, 0.2215],
- [0.1272, 0.2016, 0.1561, 0.0940],
- [0.3757, 0.4933, 0.7488, 0.9865],
- [0.3759, 0.5002, 0.7492, 0.9955],
- [0.1971, 0.5456, 0.3532, 0.8646],
- [0.5790, 0.4115, 0.3430, 0.7161],
- ]
- )
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
- self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
-
- # Check the masks have also been padded
- self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
- self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
-
- # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
- # format and not in the range [0, 1]
- encoding = image_processing(
- images=images,
- annotations=annotations,
- return_segmentation_masks=True,
- do_convert_annotations=False,
- return_tensors="pt",
- )
- self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
- self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
- # Convert to absolute coordinates
- unnormalized_boxes_0 = torch.vstack(
- [
- expected_boxes_0[:, 0] * postprocessed_width,
- expected_boxes_0[:, 1] * postprocessed_height,
- expected_boxes_0[:, 2] * postprocessed_width,
- expected_boxes_0[:, 3] * postprocessed_height,
- ]
- ).T
- unnormalized_boxes_1 = torch.vstack(
- [
- expected_boxes_1[:, 0] * postprocessed_width,
- expected_boxes_1[:, 1] * postprocessed_height,
- expected_boxes_1[:, 2] * postprocessed_width,
- expected_boxes_1[:, 3] * postprocessed_height,
- ]
- ).T
- # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
- expected_boxes_0 = torch.vstack(
- [
- unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
- unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
- unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
- unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
- ]
- ).T
- expected_boxes_1 = torch.vstack(
- [
- unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
- unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
- unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
- unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
- ]
- ).T
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
- self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
-
- # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
- def test_batched_coco_panoptic_annotations(self):
- # prepare image, target and masks_path
- image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
-
- with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
- target = json.loads(f.read())
-
- annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
- annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
-
- w_0, h_0 = image_0.size
- w_1, h_1 = image_1.size
- for i in range(len(annotation_1["segments_info"])):
- coords = annotation_1["segments_info"][i]["bbox"]
- new_bbox = [
- coords[0] * w_1 / w_0,
- coords[1] * h_1 / h_0,
- coords[2] * w_1 / w_0,
- coords[3] * h_1 / h_0,
- ]
- annotation_1["segments_info"][i]["bbox"] = new_bbox
-
- masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
-
- images = [image_0, image_1]
- annotations = [annotation_0, annotation_1]
-
- # encode them
- image_processing = DetaImageProcessor(format="coco_panoptic")
- encoding = image_processing(
- images=images,
- annotations=annotations,
- masks_path=masks_path,
- return_tensors="pt",
- return_segmentation_masks=True,
- )
-
- # Check the pixel values have been padded
- postprocessed_height, postprocessed_width = 800, 1066
- expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
- self.assertEqual(encoding["pixel_values"].shape, expected_shape)
-
- # Check the bounding boxes have been adjusted for padded images
- self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
- self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
- expected_boxes_0 = torch.tensor(
- [
- [0.2625, 0.5437, 0.4688, 0.8625],
- [0.7719, 0.4104, 0.4531, 0.7125],
- [0.5000, 0.4927, 0.9969, 0.9854],
- [0.1688, 0.2000, 0.2063, 0.0917],
- [0.5492, 0.2760, 0.0578, 0.2187],
- [0.4992, 0.4990, 0.9984, 0.9979],
- ]
- )
- expected_boxes_1 = torch.tensor(
- [
- [0.1576, 0.3262, 0.2814, 0.5175],
- [0.4634, 0.2463, 0.2720, 0.4275],
- [0.3002, 0.2956, 0.5985, 0.5913],
- [0.1013, 0.1200, 0.1238, 0.0550],
- [0.3297, 0.1656, 0.0347, 0.1312],
- [0.2997, 0.2994, 0.5994, 0.5987],
- ]
- )
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
- self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
-
- # Check the masks have also been padded
- self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
- self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
-
- # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
- # format and not in the range [0, 1]
- encoding = image_processing(
- images=images,
- annotations=annotations,
- masks_path=masks_path,
- return_segmentation_masks=True,
- do_convert_annotations=False,
- return_tensors="pt",
- )
- self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
- self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
- # Convert to absolute coordinates
- unnormalized_boxes_0 = torch.vstack(
- [
- expected_boxes_0[:, 0] * postprocessed_width,
- expected_boxes_0[:, 1] * postprocessed_height,
- expected_boxes_0[:, 2] * postprocessed_width,
- expected_boxes_0[:, 3] * postprocessed_height,
- ]
- ).T
- unnormalized_boxes_1 = torch.vstack(
- [
- expected_boxes_1[:, 0] * postprocessed_width,
- expected_boxes_1[:, 1] * postprocessed_height,
- expected_boxes_1[:, 2] * postprocessed_width,
- expected_boxes_1[:, 3] * postprocessed_height,
- ]
- ).T
- # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
- expected_boxes_0 = torch.vstack(
- [
- unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
- unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
- unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
- unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
- ]
- ).T
- expected_boxes_1 = torch.vstack(
- [
- unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
- unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
- unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
- unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
- ]
- ).T
- self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
- self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
-
- # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
- def test_max_width_max_height_resizing_and_pad_strategy(self):
- image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
-
- # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
- image_processor = DetaImageProcessor(
- size={"max_height": 100, "max_width": 100},
- do_pad=False,
- )
- inputs = image_processor(images=[image_1], return_tensors="pt")
- self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
-
- # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
- image_processor = DetaImageProcessor(
- size={"max_height": 300, "max_width": 100},
- do_pad=False,
- )
- inputs = image_processor(images=[image_1], return_tensors="pt")
-
- # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
- image_processor = DetaImageProcessor(
- size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
- )
- inputs = image_processor(images=[image_1], return_tensors="pt")
- self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
-
- # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100
- image_processor = DetaImageProcessor(
- size={"max_height": 300, "max_width": 100},
- do_pad=True,
- pad_size={"height": 301, "width": 101},
- )
- inputs = image_processor(images=[image_1], return_tensors="pt")
- self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
-
- ### Check for batch
- image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
-
- # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
- image_processor = DetaImageProcessor(
- size={"max_height": 150, "max_width": 100},
- do_pad=True,
- pad_size={"height": 150, "width": 100},
- )
- inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
- self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py
deleted file mode 100644
index fa840212a5d62d..00000000000000
--- a/tests/models/deta/test_modeling_deta.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch DETA model."""
-
-import collections
-import inspect
-import math
-import re
-import unittest
-
-from transformers import DetaConfig, ResNetConfig, is_torch_available, is_torchvision_available, is_vision_available
-from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torchvision, require_vision, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers.pytorch_utils import id_tensor_storage
-
-if is_torchvision_available():
- from transformers import DetaForObjectDetection, DetaModel
-
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import AutoImageProcessor
-
-
-class DetaModelTester:
- def __init__(
- self,
- parent,
- batch_size=8,
- is_training=True,
- use_labels=True,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=8,
- intermediate_size=4,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- num_queries=12,
- two_stage_num_proposals=12,
- num_channels=3,
- image_size=224,
- n_targets=8,
- num_labels=91,
- num_feature_levels=4,
- encoder_n_points=2,
- decoder_n_points=6,
- two_stage=True,
- assign_first_stage=True,
- assign_second_stage=True,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.is_training = is_training
- self.use_labels = use_labels
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.num_queries = num_queries
- self.two_stage_num_proposals = two_stage_num_proposals
- self.num_channels = num_channels
- self.image_size = image_size
- self.n_targets = n_targets
- self.num_labels = num_labels
- self.num_feature_levels = num_feature_levels
- self.encoder_n_points = encoder_n_points
- self.decoder_n_points = decoder_n_points
- self.two_stage = two_stage
- self.assign_first_stage = assign_first_stage
- self.assign_second_stage = assign_second_stage
-
- # we also set the expected seq length for both encoder and decoder
- self.encoder_seq_length = (
- math.ceil(self.image_size / 8) ** 2
- + math.ceil(self.image_size / 16) ** 2
- + math.ceil(self.image_size / 32) ** 2
- + math.ceil(self.image_size / 64) ** 2
- )
- self.decoder_seq_length = self.num_queries
-
- def prepare_config_and_inputs(self, model_class_name):
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
- pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device)
-
- labels = None
- if self.use_labels:
- # labels is a list of Dict (each Dict being the labels for a given example in the batch)
- labels = []
- for i in range(self.batch_size):
- target = {}
- target["class_labels"] = torch.randint(
- high=self.num_labels, size=(self.n_targets,), device=torch_device
- )
- target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device)
- target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device)
- labels.append(target)
-
- config = self.get_config(model_class_name)
- return config, pixel_values, pixel_mask, labels
-
- def get_config(self, model_class_name):
- resnet_config = ResNetConfig(
- num_channels=3,
- embeddings_size=10,
- hidden_sizes=[10, 20, 30, 40],
- depths=[1, 1, 2, 1],
- hidden_act="relu",
- num_labels=3,
- out_features=["stage2", "stage3", "stage4"],
- out_indices=[2, 3, 4],
- )
- two_stage = model_class_name == "DetaForObjectDetection"
- assign_first_stage = model_class_name == "DetaForObjectDetection"
- assign_second_stage = model_class_name == "DetaForObjectDetection"
- return DetaConfig(
- d_model=self.hidden_size,
- encoder_layers=self.num_hidden_layers,
- decoder_layers=self.num_hidden_layers,
- encoder_attention_heads=self.num_attention_heads,
- decoder_attention_heads=self.num_attention_heads,
- encoder_ffn_dim=self.intermediate_size,
- decoder_ffn_dim=self.intermediate_size,
- dropout=self.hidden_dropout_prob,
- attention_dropout=self.attention_probs_dropout_prob,
- num_queries=self.num_queries,
- two_stage_num_proposals=self.two_stage_num_proposals,
- num_labels=self.num_labels,
- num_feature_levels=self.num_feature_levels,
- encoder_n_points=self.encoder_n_points,
- decoder_n_points=self.decoder_n_points,
- two_stage=two_stage,
- assign_first_stage=assign_first_stage,
- assign_second_stage=assign_second_stage,
- backbone_config=resnet_config,
- backbone=None,
- )
-
- def prepare_config_and_inputs_for_common(self, model_class_name="DetaModel"):
- config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs(model_class_name)
- inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
- return config, inputs_dict
-
- def create_and_check_deta_model(self, config, pixel_values, pixel_mask, labels):
- model = DetaModel(config=config)
- model.to(torch_device)
- model.eval()
-
- result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
- result = model(pixel_values)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size))
-
- def create_and_check_deta_freeze_backbone(self, config, pixel_values, pixel_mask, labels):
- model = DetaModel(config=config)
- model.to(torch_device)
- model.eval()
-
- model.freeze_backbone()
-
- for _, param in model.backbone.model.named_parameters():
- self.parent.assertEqual(False, param.requires_grad)
-
- def create_and_check_deta_unfreeze_backbone(self, config, pixel_values, pixel_mask, labels):
- model = DetaModel(config=config)
- model.to(torch_device)
- model.eval()
-
- model.unfreeze_backbone()
-
- for _, param in model.backbone.model.named_parameters():
- self.parent.assertEqual(True, param.requires_grad)
-
- def create_and_check_deta_object_detection_head_model(self, config, pixel_values, pixel_mask, labels):
- model = DetaForObjectDetection(config=config)
- model.to(torch_device)
- model.eval()
-
- result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
- result = model(pixel_values)
-
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.two_stage_num_proposals, self.num_labels))
- self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4))
-
- result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
-
- self.parent.assertEqual(result.loss.shape, ())
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.two_stage_num_proposals, self.num_labels))
- self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4))
-
-
-@require_torchvision
-class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (DetaModel, DetaForObjectDetection) if is_torchvision_available() else ()
- pipeline_model_mapping = (
- {"image-feature-extraction": DetaModel, "object-detection": DetaForObjectDetection}
- if is_torchvision_available()
- else {}
- )
- is_encoder_decoder = True
- test_torchscript = False
- test_pruning = False
- test_head_masking = False
- test_missing_keys = False
-
- # TODO: Fix the failed tests when this model gets more usage
- def is_pipeline_test_to_skip(
- self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
- ):
- if pipeline_test_casse_name == "ObjectDetectionPipelineTests":
- return True
-
- return False
-
- @unittest.skip("Skip for now. PR #22437 causes some loading issue. See (not merged) #22656 for some discussions.")
- def test_can_use_safetensors(self):
- super().test_can_use_safetensors()
-
- # special case for head models
- def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
- inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
- if return_labels:
- if model_class.__name__ == "DetaForObjectDetection":
- labels = []
- for i in range(self.model_tester.batch_size):
- target = {}
- target["class_labels"] = torch.ones(
- size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long
- )
- target["boxes"] = torch.ones(
- self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float
- )
- target["masks"] = torch.ones(
- self.model_tester.n_targets,
- self.model_tester.image_size,
- self.model_tester.image_size,
- device=torch_device,
- dtype=torch.float,
- )
- labels.append(target)
- inputs_dict["labels"] = labels
-
- return inputs_dict
-
- def setUp(self):
- self.model_tester = DetaModelTester(self)
- self.config_tester = ConfigTester(self, config_class=DetaConfig, has_text_modality=False)
-
- def test_config(self):
- # we don't test common_properties and arguments_init as these don't apply for DETA
- self.config_tester.create_and_test_config_to_json_string()
- self.config_tester.create_and_test_config_to_json_file()
- self.config_tester.create_and_test_config_from_and_save_pretrained()
- self.config_tester.create_and_test_config_with_num_labels()
- self.config_tester.check_config_can_be_init_without_params()
-
- def test_deta_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
- self.model_tester.create_and_check_deta_model(*config_and_inputs)
-
- def test_deta_freeze_backbone(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
- self.model_tester.create_and_check_deta_freeze_backbone(*config_and_inputs)
-
- def test_deta_unfreeze_backbone(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaModel")
- self.model_tester.create_and_check_deta_unfreeze_backbone(*config_and_inputs)
-
- def test_deta_object_detection_head_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class_name="DetaForObjectDetection")
- self.model_tester.create_and_check_deta_object_detection_head_model(*config_and_inputs)
-
- @unittest.skip(reason="DETA does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="DETA does not use inputs_embeds")
- def test_inputs_embeds_matches_input_ids(self):
- pass
-
- @unittest.skip(reason="DETA does not have a get_input_embeddings method")
- def test_model_common_attributes(self):
- pass
-
- @unittest.skip(reason="DETA is not a generative model")
- def test_generate_without_input_ids(self):
- pass
-
- @unittest.skip(reason="DETA does not use token embeddings")
- def test_resize_tokens_embeddings(self):
- pass
-
- @unittest.skip(reason="Feed forward chunking is not implemented")
- def test_feed_forward_chunking(self):
- pass
-
- def test_attention_outputs(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- for model_class in self.all_model_classes:
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = False
- config.return_dict = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.encoder_attentions
- self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
- # check that output_attentions also work using config
- del inputs_dict["output_attentions"]
- config.output_attentions = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.encoder_attentions
- self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
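-            # DETA's multi-scale deformable attention indexes its attention weights by
-            # (num_heads, num_feature_levels, num_points) rather than (num_heads, seq_len, seq_len).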
- self.assertListEqual(
- list(attentions[0].shape[-3:]),
- [
- self.model_tester.num_attention_heads,
- self.model_tester.num_feature_levels,
- self.model_tester.encoder_n_points,
- ],
- )
- out_len = len(outputs)
-
- correct_outlen = 8
-
- # loss is at first position
- if "labels" in inputs_dict:
- correct_outlen += 1 # loss is added to beginning
- # Object Detection model returns pred_logits and pred_boxes
- if model_class.__name__ == "DetaForObjectDetection":
- correct_outlen += 2
-
- self.assertEqual(out_len, correct_outlen)
-
- # decoder attentions
- decoder_attentions = outputs.decoder_attentions
- self.assertIsInstance(decoder_attentions, (list, tuple))
- self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
- self.assertListEqual(
- list(decoder_attentions[0].shape[-3:]),
- [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries],
- )
-
- # cross attentions
- cross_attentions = outputs.cross_attentions
- self.assertIsInstance(cross_attentions, (list, tuple))
- self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
- self.assertListEqual(
- list(cross_attentions[0].shape[-3:]),
- [
- self.model_tester.num_attention_heads,
- self.model_tester.num_feature_levels,
- self.model_tester.decoder_n_points,
- ],
- )
-
- # Check attention is always last and order is fine
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
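-            # Requesting hidden states on top of attentions grows the output tuple by a fixed number of entries.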
- if hasattr(self.model_tester, "num_hidden_states_types"):
- added_hidden_states = self.model_tester.num_hidden_states_types
- elif self.is_encoder_decoder:
- added_hidden_states = 2
- else:
- added_hidden_states = 1
- self.assertEqual(out_len + added_hidden_states, len(outputs))
-
- self_attentions = outputs.encoder_attentions
-
- self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
- self.assertListEqual(
- list(self_attentions[0].shape[-3:]),
- [
- self.model_tester.num_attention_heads,
- self.model_tester.num_feature_levels,
- self.model_tester.encoder_n_points,
- ],
- )
-
- # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad
- def test_retain_grad_hidden_states_attentions(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.output_hidden_states = True
- config.output_attentions = True
-
- # no need to test all models as different heads yield the same functionality
- model_class = self.all_model_classes[0]
- model = model_class(config)
- model.to(torch_device)
-
- inputs = self._prepare_for_class(inputs_dict, model_class)
-
- outputs = model(**inputs)
-
- # we take the second output since last_hidden_state is the second item
- output = outputs[1]
-
- encoder_hidden_states = outputs.encoder_hidden_states[0]
- encoder_attentions = outputs.encoder_attentions[0]
- encoder_hidden_states.retain_grad()
- encoder_attentions.retain_grad()
-
- decoder_attentions = outputs.decoder_attentions[0]
- decoder_attentions.retain_grad()
-
- cross_attentions = outputs.cross_attentions[0]
- cross_attentions.retain_grad()
-
- output.flatten()[0].backward(retain_graph=True)
-
- self.assertIsNotNone(encoder_hidden_states.grad)
- self.assertIsNotNone(encoder_attentions.grad)
- self.assertIsNotNone(decoder_attentions.grad)
- self.assertIsNotNone(cross_attentions.grad)
-
- def test_forward_auxiliary_loss(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.auxiliary_loss = True
-
- # only test for object detection and segmentation model
- for model_class in self.all_model_classes[1:]:
- model = model_class(config)
- model.to(torch_device)
-
- inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
- outputs = model(**inputs)
-
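-            # With auxiliary_loss enabled, every intermediate decoder layer (all but the last) adds an auxiliary prediction.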
- self.assertIsNotNone(outputs.auxiliary_outputs)
- self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1)
-
- def test_forward_signature(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- signature = inspect.signature(model.forward)
- # signature.parameters is an OrderedDict => so arg_names order is deterministic
- arg_names = [*signature.parameters.keys()]
-
- if model.config.is_encoder_decoder:
- expected_arg_names = ["pixel_values", "pixel_mask"]
- expected_arg_names.extend(
- ["head_mask", "decoder_head_mask", "encoder_outputs"]
-                    if "head_mask" in arg_names and "decoder_head_mask" in arg_names
- else []
- )
- self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
- else:
- expected_arg_names = ["pixel_values", "pixel_mask"]
-                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
-
- @unittest.skip(reason="Model doesn't use tied weights")
- def test_tied_model_weights_key_ignore(self):
- pass
-
- def test_initialization(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
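-        # _config_zero_init shrinks all initializer ranges to ~0, so standard-initialized weights
-        # should end up with a mean of 0.0 (LayerNorm-style weights stay at 1.0).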
- configs_no_init = _config_zero_init(config)
- for model_class in self.all_model_classes:
- model = model_class(config=configs_no_init)
- # Skip the check for the backbone
- for name, module in model.named_modules():
- if module.__class__.__name__ == "DetaBackboneWithPositionalEncodings":
- backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
- break
-
- for name, param in model.named_parameters():
- if param.requires_grad:
- if (
- "level_embed" in name
- or "sampling_offsets.bias" in name
- or "value_proj" in name
- or "output_proj" in name
- or "reference_points" in name
- or name in backbone_params
- ):
- continue
- self.assertIn(
- ((param.data.mean() * 1e9).round() / 1e9).item(),
- [0.0, 1.0],
- msg=f"Parameter {name} of model {model_class} seems not properly initialized",
- )
-
- @unittest.skip("No support for low_cpu_mem_usage=True.")
- def test_save_load_low_cpu_mem_usage(self):
- pass
-
- @unittest.skip("No support for low_cpu_mem_usage=True.")
- def test_save_load_low_cpu_mem_usage_checkpoints(self):
- pass
-
- @unittest.skip("No support for low_cpu_mem_usage=True.")
- def test_save_load_low_cpu_mem_usage_no_safetensors(self):
- pass
-
- # Inspired by tests.test_modeling_common.ModelTesterMixin.test_tied_weights_keys
- def test_tied_weights_keys(self):
- for model_class in self.all_model_classes:
-            # We need to pass the model class name to correctly initialize the config.
-            # If we don't pass it, the config for `DetaForObjectDetection` will be initialized
- # with `two_stage=False` and the test will fail because for that case `class_embed`
- # weights are not tied.
- config, _ = self.model_tester.prepare_config_and_inputs_for_common(model_class_name=model_class.__name__)
- config.tie_word_embeddings = True
-
- model_tied = model_class(config)
-
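-            # Group parameter names by the id of their underlying storage: names sharing a storage are tied weights.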
- ptrs = collections.defaultdict(list)
- for name, tensor in model_tied.state_dict().items():
- ptrs[id_tensor_storage(tensor)].append(name)
-
- # These are all the pointers of shared tensors.
- tied_params = [names for _, names in ptrs.items() if len(names) > 1]
-
- tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else []
- # Detect we get a hit for each key
- for key in tied_weight_keys:
- is_tied_key = any(re.search(key, p) for group in tied_params for p in group)
- self.assertTrue(is_tied_key, f"{key} is not a tied weight key for {model_class}.")
-
-            # Remove the tied weight keys from the tied parameter groups -> each group should have at most one name left
- for key in tied_weight_keys:
- for i in range(len(tied_params)):
- tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None]
-
- tied_params = [group for group in tied_params if len(group) > 1]
- self.assertListEqual(
- tied_params,
- [],
- f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.",
- )
-
-
-TOLERANCE = 1e-4
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- return image
-
-
-@require_torchvision
-@require_vision
-@slow
-class DetaModelIntegrationTests(unittest.TestCase):
- @cached_property
- def default_image_processor(self):
- return AutoImageProcessor.from_pretrained("jozhang97/deta-resnet-50") if is_vision_available() else None
-
- def test_inference_object_detection_head(self):
- model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50").to(torch_device)
-
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- with torch.no_grad():
- outputs = model(**inputs)
-
- expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
- self.assertEqual(outputs.logits.shape, expected_shape_logits)
-
- expected_logits = torch.tensor(
- [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
- ).to(torch_device)
- expected_boxes = torch.tensor(
- [[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]
- ).to(torch_device)
-
- self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
-
- expected_shape_boxes = torch.Size((1, 300, 4))
- self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
- self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
-
- # verify postprocessing
- results = image_processor.post_process_object_detection(
- outputs, threshold=0.3, target_sizes=[image.size[::-1]]
- )[0]
- expected_scores = torch.tensor([0.6392, 0.6276, 0.5546, 0.5260, 0.4706], device=torch_device)
- expected_labels = [75, 17, 17, 75, 63]
- expected_slice_boxes = torch.tensor([40.5866, 73.2107, 176.1421, 117.1751], device=torch_device)
-
- self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
- self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
- self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
-
- def test_inference_object_detection_head_swin_backbone(self):
- model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large").to(torch_device)
-
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- with torch.no_grad():
- outputs = model(**inputs)
-
- expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
- self.assertEqual(outputs.logits.shape, expected_shape_logits)
-
- expected_logits = torch.tensor(
- [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
- ).to(torch_device)
- expected_boxes = torch.tensor(
- [[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]
- ).to(torch_device)
-
- self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
-
- expected_shape_boxes = torch.Size((1, 300, 4))
- self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
- self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
-
- # verify postprocessing
- results = image_processor.post_process_object_detection(
- outputs, threshold=0.3, target_sizes=[image.size[::-1]]
- )[0]
- expected_scores = torch.tensor([0.6831, 0.6826, 0.5684, 0.5464, 0.4392], device=torch_device)
- expected_labels = [17, 17, 75, 75, 63]
- expected_slice_boxes = torch.tensor([345.8478, 23.6754, 639.8562, 372.8265], device=torch_device)
-
- self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
- self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
- self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
diff --git a/tests/models/efficientformer/__init__.py b/tests/models/efficientformer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/efficientformer/test_image_processing_efficientformer.py b/tests/models/efficientformer/test_image_processing_efficientformer.py
deleted file mode 100644
index bd91b771061635..00000000000000
--- a/tests/models/efficientformer/test_image_processing_efficientformer.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# coding=utf-8
-# Copyright 2021 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
-
-
-if is_vision_available():
- from transformers import ViTImageProcessor
-
-
-class EfficientFormerImageProcessorTester(unittest.TestCase):
- def __init__(
- self,
- parent,
- batch_size=13,
- num_channels=3,
- image_size=224,
- min_resolution=30,
- max_resolution=400,
- do_resize=True,
- size=None,
- do_normalize=True,
- image_mean=[0.5, 0.5, 0.5],
- image_std=[0.5, 0.5, 0.5],
- ):
- size = size if size is not None else {"height": 18, "width": 18}
- self.parent = parent
- self.batch_size = batch_size
- self.num_channels = num_channels
- self.image_size = image_size
- self.min_resolution = min_resolution
- self.max_resolution = max_resolution
- self.do_resize = do_resize
- self.size = size
- self.do_normalize = do_normalize
- self.image_mean = image_mean
- self.image_std = image_std
-
- def prepare_image_processor_dict(self):
- return {
- "image_mean": self.image_mean,
- "image_std": self.image_std,
- "do_normalize": self.do_normalize,
- "do_resize": self.do_resize,
- "size": self.size,
- }
-
- def expected_output_image_shape(self, images):
- return self.num_channels, self.size["height"], self.size["width"]
-
- def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
- return prepare_image_inputs(
- batch_size=self.batch_size,
- num_channels=self.num_channels,
- min_resolution=self.min_resolution,
- max_resolution=self.max_resolution,
- equal_resolution=equal_resolution,
- numpify=numpify,
- torchify=torchify,
- )
-
-
-@require_torch
-@require_vision
-class EfficientFormerImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
- image_processing_class = ViTImageProcessor if is_vision_available() else None
-
- def setUp(self):
- self.image_processor_tester = EfficientFormerImageProcessorTester(self)
-
- @property
- def image_processor_dict(self):
- return self.image_processor_tester.prepare_image_processor_dict()
-
- def test_image_proc_properties(self):
- image_processor = self.image_processing_class(**self.image_processor_dict)
- self.assertTrue(hasattr(image_processor, "image_mean"))
- self.assertTrue(hasattr(image_processor, "image_std"))
- self.assertTrue(hasattr(image_processor, "do_normalize"))
- self.assertTrue(hasattr(image_processor, "do_resize"))
- self.assertTrue(hasattr(image_processor, "size"))
diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py
deleted file mode 100644
index 6b7ce810ce8c0b..00000000000000
--- a/tests/models/efficientformer/test_modeling_efficientformer.py
+++ /dev/null
@@ -1,478 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch EfficientFormer model."""
-
-import unittest
-import warnings
-from typing import List
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- EfficientFormerForImageClassification,
- EfficientFormerForImageClassificationWithTeacher,
- EfficientFormerModel,
- )
- from transformers.models.auto.modeling_auto import (
- MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
- MODEL_MAPPING_NAMES,
- )
-
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import EfficientFormerImageProcessor
-
-
-class EfficientFormerModelTester:
- def __init__(
- self,
- parent,
- batch_size: int = 13,
- image_size: int = 64,
- patch_size: int = 2,
- embed_dim: int = 3,
- num_channels: int = 3,
- is_training: bool = True,
- use_labels: bool = True,
- hidden_size: int = 128,
- hidden_sizes=[16, 32, 64, 128],
- num_hidden_layers: int = 7,
- num_attention_heads: int = 4,
- intermediate_size: int = 37,
- hidden_act: str = "gelu",
- hidden_dropout_prob: float = 0.1,
- attention_probs_dropout_prob: float = 0.1,
- type_sequence_label_size: int = 10,
- initializer_range: float = 0.02,
- encoder_stride: int = 2,
- num_attention_outputs: int = 1,
- dim: int = 128,
- depths: List[int] = [2, 2, 2, 2],
- resolution: int = 2,
- mlp_expansion_ratio: int = 2,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.image_size = image_size
- self.patch_size = patch_size
- self.num_channels = num_channels
- self.is_training = is_training
- self.use_labels = use_labels
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.encoder_stride = encoder_stride
- self.num_attention_outputs = num_attention_outputs
- self.embed_dim = embed_dim
- self.seq_length = embed_dim + 1
- self.resolution = resolution
- self.depths = depths
- self.hidden_sizes = hidden_sizes
- self.dim = dim
- self.mlp_expansion_ratio = mlp_expansion_ratio
-
- def prepare_config_and_inputs(self):
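-        # Random pixel values plus (optionally) integer classification labels for the checks below.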
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
- labels = None
- if self.use_labels:
- labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
- config = self.get_config()
- return config, pixel_values, labels
-
- def get_config(self):
- return EfficientFormerConfig(
- image_size=self.image_size,
- patch_size=self.patch_size,
- num_channels=self.num_channels,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- is_decoder=False,
- initializer_range=self.initializer_range,
- encoder_stride=self.encoder_stride,
- resolution=self.resolution,
- depths=self.depths,
- hidden_sizes=self.hidden_sizes,
- dim=self.dim,
- mlp_expansion_ratio=self.mlp_expansion_ratio,
- )
-
- def create_and_check_model(self, config, pixel_values, labels):
- model = EfficientFormerModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_image_classification(self, config, pixel_values, labels):
- config.num_labels = self.type_sequence_label_size
- model = EfficientFormerForImageClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values, labels=labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
- # test greyscale images
- config.num_channels = 1
- model = EfficientFormerForImageClassification(config)
- model.to(torch_device)
- model.eval()
-
- pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
- result = model(pixel_values)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- pixel_values,
- labels,
- ) = config_and_inputs
- inputs_dict = {"pixel_values": pixel_values}
- return config, inputs_dict
-
-
-@require_torch
-class EfficientFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- """
- Here we also overwrite some of the tests of test_modeling_common.py, as EfficientFormer does not use input_ids, inputs_embeds,
- attention_mask and seq_length.
- """
-
- all_model_classes = (
- (
- EfficientFormerModel,
- EfficientFormerForImageClassificationWithTeacher,
- EfficientFormerForImageClassification,
- )
- if is_torch_available()
- else ()
- )
- pipeline_model_mapping = (
- {
- "image-feature-extraction": EfficientFormerModel,
- "image-classification": (
- EfficientFormerForImageClassification,
- EfficientFormerForImageClassificationWithTeacher,
- ),
- }
- if is_torch_available()
- else {}
- )
- fx_compatible = False
-
- test_pruning = False
- test_resize_embeddings = False
- test_head_masking = False
-
- def setUp(self):
- self.model_tester = EfficientFormerModelTester(self)
- self.config_tester = ConfigTester(
- self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
- )
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
- def test_model_common_attributes(self):
- pass
-
- def test_hidden_states_output(self):
- def check_hidden_states_output(inputs_dict, config, model_class):
- model = model_class(config)
- model.to(torch_device)
- model.eval()
-
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
- hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
- expected_num_layers = getattr(
- self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
- )
- self.assertEqual(len(hidden_states), expected_num_layers)
-
- if hasattr(self.model_tester, "encoder_seq_length"):
- seq_length = self.model_tester.encoder_seq_length
- if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
- seq_length = seq_length * self.model_tester.chunk_length
- else:
- seq_length = self.model_tester.seq_length
-
- self.assertListEqual(
- list(hidden_states[-1].shape[-2:]),
- [seq_length, self.model_tester.hidden_size],
- )
-
- if config.is_encoder_decoder:
- hidden_states = outputs.decoder_hidden_states
-
- self.assertIsInstance(hidden_states, (list, tuple))
- self.assertEqual(len(hidden_states), expected_num_layers)
- seq_len = getattr(self.model_tester, "seq_length", None)
- decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
- self.assertListEqual(
- list(hidden_states[-1].shape[-2:]),
- [decoder_seq_length, self.model_tester.hidden_size],
- )
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- inputs_dict["output_hidden_states"] = True
- check_hidden_states_output(inputs_dict, config, model_class)
-
- # check that output_hidden_states also work using config
- del inputs_dict["output_hidden_states"]
- config.output_hidden_states = True
-
- check_hidden_states_output(inputs_dict, config, model_class)
-
- def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
- inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
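-        # The distillation ("with teacher") head is inference-only, so it never receives labels.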
- if return_labels:
- if model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher":
- del inputs_dict["labels"]
-
- return inputs_dict
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
- def test_for_masked_image_modeling(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
- def test_for_image_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
- # special case for EfficientFormerForImageClassificationWithTeacher model
- def test_training(self):
- if not self.model_tester.is_training:
- return
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- for model_class in self.all_model_classes:
- # EfficientFormerForImageClassificationWithTeacher supports inference-only
- if (
- model_class.__name__ in MODEL_MAPPING_NAMES.values()
- or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
- ):
- continue
- model = model_class(config)
- model.to(torch_device)
- model.train()
- inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
- loss = model(**inputs).loss
- loss.backward()
-
- def test_problem_types(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- problem_types = [
- {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
- {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
- {"title": "regression", "num_labels": 1, "dtype": torch.float},
- ]
-
- for model_class in self.all_model_classes:
- if (
- model_class.__name__
- not in [
- *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(),
- ]
- or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher"
- ):
- continue
-
- for problem_type in problem_types:
- with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
- config.problem_type = problem_type["title"]
- config.num_labels = problem_type["num_labels"]
-
- model = model_class(config)
- model.to(torch_device)
- model.train()
-
- inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
-
- if problem_type["num_labels"] > 1:
- inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
-
- inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
-
-                    # This tests that we do not trigger the warning from PyTorch "Using a target size that is different
-                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
-                    # they have the same size." which is a symptom that something is wrong for the regression problem.
-                    # See https://github.com/huggingface/transformers/issues/11780
- with warnings.catch_warnings(record=True) as warning_list:
- loss = model(**inputs).loss
- for w in warning_list:
- if "Using a target size that is different to the input size" in str(w.message):
- raise ValueError(
- f"Something is going wrong in the regression problem: intercepted {w.message}"
- )
-
- loss.backward()
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "snap-research/efficientformer-l1-300"
- model = EfficientFormerModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- def test_attention_outputs(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- seq_len = getattr(self.model_tester, "seq_length", None)
- encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
- encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
- chunk_length = getattr(self.model_tester, "chunk_length", None)
-
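-        # chunk_length / num_hashes are only set by testers of chunked-attention models; this tester leaves them unset.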
- if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
- encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
- for model_class in self.all_model_classes:
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = False
- config.return_dict = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
- # check that output_attentions also work using config
- del inputs_dict["output_attentions"]
- config.output_attentions = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
- if chunk_length is not None:
- self.assertListEqual(
- list(attentions[0].shape[-4:]),
- [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
- )
- else:
- self.assertListEqual(
- list(attentions[0].shape[-3:]),
- [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
- )
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- return image
-
-
-@require_torch
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
- @cached_property
- def default_image_processor(self):
- return (
- EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
- if is_vision_available()
- else None
- )
-
- @slow
- def test_inference_image_classification_head(self):
- model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300").to(
- torch_device
- )
-
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_shape = (1, 1000)
- self.assertEqual(outputs.logits.shape, expected_shape)
-
- expected_slice = torch.tensor([-0.0555, 0.4825, -0.0852]).to(torch_device)
- self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_image_classification_head_with_teacher(self):
- model = EfficientFormerForImageClassificationWithTeacher.from_pretrained(
- "snap-research/efficientformer-l1-300"
- ).to(torch_device)
-
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_shape = (1, 1000)
- self.assertEqual(outputs.logits.shape, expected_shape)
-
- expected_slice = torch.tensor([-0.1312, 0.4353, -1.0499]).to(torch_device)
- self.assertTrue(torch.allclose(outputs.logits[0][:3], expected_slice, atol=1e-4))
diff --git a/tests/models/efficientformer/test_modeling_tf_efficientformer.py b/tests/models/efficientformer/test_modeling_tf_efficientformer.py
deleted file mode 100644
index abb08787408b67..00000000000000
--- a/tests/models/efficientformer/test_modeling_tf_efficientformer.py
+++ /dev/null
@@ -1,409 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TensorFlow EfficientFormer model."""
-
-import inspect
-import unittest
-from typing import List
-
-import numpy as np
-
-from transformers import EfficientFormerConfig
-from transformers.testing_utils import require_tf, require_vision, slow
-from transformers.utils import cached_property, is_tf_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
- import tensorflow as tf
-
- from transformers import (
- TFEfficientFormerForImageClassification,
- TFEfficientFormerForImageClassificationWithTeacher,
- TFEfficientFormerModel,
- )
- from transformers.modeling_tf_utils import keras
-
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import EfficientFormerImageProcessor
-
-
-class TFEfficientFormerModelTester:
- def __init__(
- self,
- parent,
- batch_size: int = 13,
- image_size: int = 64,
- patch_size: int = 2,
- embed_dim: int = 3,
- num_channels: int = 3,
- is_training: bool = True,
- use_labels: bool = True,
- hidden_size: int = 128,
- hidden_sizes=[16, 32, 64, 128],
- num_hidden_layers: int = 7,
- num_attention_heads: int = 4,
- intermediate_size: int = 37,
- hidden_act: str = "gelu",
- hidden_dropout_prob: float = 0.1,
- attention_probs_dropout_prob: float = 0.1,
- type_sequence_label_size: int = 10,
- initializer_range: float = 0.02,
- encoder_stride: int = 2,
- num_attention_outputs: int = 1,
- dim: int = 128,
- depths: List[int] = [2, 2, 2, 2],
- resolution: int = 2,
- mlp_expansion_ratio: int = 2,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.image_size = image_size
- self.patch_size = patch_size
- self.num_channels = num_channels
- self.is_training = is_training
- self.use_labels = use_labels
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.encoder_stride = encoder_stride
- self.num_attention_outputs = num_attention_outputs
- self.embed_dim = embed_dim
- self.seq_length = embed_dim + 1
- self.resolution = resolution
- self.depths = depths
- self.hidden_sizes = hidden_sizes
- self.dim = dim
- self.mlp_expansion_ratio = mlp_expansion_ratio
-
- def prepare_config_and_inputs(self):
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
- labels = None
- if self.use_labels:
- labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
- config = self.get_config()
-
- return config, pixel_values, labels
-
- def get_config(self):
- return EfficientFormerConfig(
- image_size=self.image_size,
- patch_size=self.patch_size,
- num_channels=self.num_channels,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- is_decoder=False,
- initializer_range=self.initializer_range,
- encoder_stride=self.encoder_stride,
- resolution=self.resolution,
- depths=self.depths,
- hidden_sizes=self.hidden_sizes,
- dim=self.dim,
- mlp_expansion_ratio=self.mlp_expansion_ratio,
- )
-
- def create_and_check_model(self, config, pixel_values, labels):
- model = TFEfficientFormerModel(config=config)
- result = model(pixel_values, training=False)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_image_classification(self, config, pixel_values, labels):
- config.num_labels = self.type_sequence_label_size
- model = TFEfficientFormerForImageClassification(config)
- result = model(pixel_values, labels=labels, training=False)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
- # test greyscale images
- config.num_channels = 1
- model = TFEfficientFormerForImageClassification(config)
-
- pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
- result = model(pixel_values, labels=labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- config, pixel_values, labels = config_and_inputs
- inputs_dict = {"pixel_values": pixel_values}
- return config, inputs_dict
-
-
-@require_tf
-class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- """
- Here we also overwrite some of the tests of test_modeling_tf_common.py, as EfficientFormer does not use input_ids,
- inputs_embeds, attention_mask and seq_length.
- """
-
- all_model_classes = (
- (
- TFEfficientFormerModel,
- TFEfficientFormerForImageClassificationWithTeacher,
- TFEfficientFormerForImageClassification,
- )
- if is_tf_available()
- else ()
- )
- pipeline_model_mapping = (
- {
- "feature-extraction": TFEfficientFormerModel,
- "image-classification": (
- TFEfficientFormerForImageClassification,
- TFEfficientFormerForImageClassificationWithTeacher,
- ),
- }
- if is_tf_available()
- else {}
- )
-
- fx_compatible = False
-
- test_pruning = False
- test_resize_embeddings = False
- test_head_masking = False
- test_onnx = False
-
- def setUp(self):
- self.model_tester = TFEfficientFormerModelTester(self)
- self.config_tester = ConfigTester(
- self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
- )
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
- def test_model_common_attributes(self):
- pass
-
- def test_forward_signature(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- signature = inspect.signature(model.call)
- # signature.parameters is an OrderedDict => so arg_names order is deterministic
- arg_names = [*signature.parameters.keys()]
-
- expected_arg_names = ["pixel_values"]
- self.assertListEqual(arg_names[:1], expected_arg_names)
-
- def test_hidden_states_output(self):
- def check_hidden_states_output(inputs_dict, config, model_class):
- model = model_class(config)
-
- outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
- hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
- expected_num_layers = getattr(
- self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
- )
- self.assertEqual(len(hidden_states), expected_num_layers)
-
- if hasattr(self.model_tester, "encoder_seq_length"):
- seq_length = self.model_tester.encoder_seq_length
- if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
- seq_length = seq_length * self.model_tester.chunk_length
- else:
- seq_length = self.model_tester.seq_length
-
- self.assertListEqual(
- list(hidden_states[-1].shape[-2:]),
- [seq_length, self.model_tester.hidden_size],
- )
-
- if config.is_encoder_decoder:
- hidden_states = outputs.decoder_hidden_states
-
-                self.assertIsInstance(hidden_states, (list, tuple))
- self.assertEqual(len(hidden_states), expected_num_layers)
- seq_len = getattr(self.model_tester, "seq_length", None)
- decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
-
- self.assertListEqual(
- list(hidden_states[-1].shape[-2:]),
- [decoder_seq_length, self.model_tester.hidden_size],
- )
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- inputs_dict["output_hidden_states"] = True
- check_hidden_states_output(inputs_dict, config, model_class)
-
- # check that output_hidden_states also work using config
- del inputs_dict["output_hidden_states"]
- config.output_hidden_states = True
-
- check_hidden_states_output(inputs_dict, config, model_class)
-
- def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
- inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
- if return_labels:
- if model_class.__name__ == "TFEfficientFormerForImageClassificationWithTeacher":
- del inputs_dict["labels"]
-
- return inputs_dict
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
- def test_for_masked_image_modeling(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
-
- def test_for_image_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "snap-research/efficientformer-l1-300"
- model = TFEfficientFormerModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- def test_attention_outputs(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- seq_len = getattr(self.model_tester, "seq_length", None)
- encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
- encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
- chunk_length = getattr(self.model_tester, "chunk_length", None)
-
- if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
- encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
-
- for model_class in self.all_model_classes:
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = False
- config.return_dict = True
- model = model_class(config)
-
- outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
- attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
- # check that output_attentions also work using config
- del inputs_dict["output_attentions"]
- config.output_attentions = True
- model = model_class(config)
- outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
-
- attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)
-
- if chunk_length is not None:
- self.assertListEqual(
- list(attentions[0].shape[-4:]),
- [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
- )
- else:
- self.assertListEqual(
- list(attentions[0].shape[-3:]),
- [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
- )
-
- def test_compile_tf_model(self):
- # We use a simplified version of this test for EfficientFormer because it requires training=False
- # and Keras refuses to let us force that during functional construction
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- # Prepare our model
- model = model_class(config)
- # These are maximally general inputs for the model, with multiple None dimensions
- # Hopefully this will catch any conditionals that fail for flexible shapes
- functional_inputs = {
- key: keras.Input(shape=val.shape[1:], dtype=val.dtype, name=key)
- for key, val in model.input_signature.items()
- if key in model.dummy_inputs
- }
- outputs_dict = model(functional_inputs)
- self.assertTrue(outputs_dict is not None)
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- return image
-
-
-@require_tf
-@require_vision
-class EfficientFormerModelIntegrationTest(unittest.TestCase):
- @cached_property
- def default_image_processor(self):
- return (
- EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
- if is_vision_available()
- else None
- )
-
- @slow
- def test_inference_image_classification_head(self):
- model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="tf")
- # forward pass
- outputs = model(**inputs, training=False)
- # verify the logits
- expected_shape = tf.TensorShape((1, 1000))
- self.assertEqual(outputs.logits.shape, expected_shape)
- expected_slice = tf.constant([-0.0555, 0.4825, -0.0852])
- self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_image_classification_head_with_teacher(self):
- model = TFEfficientFormerForImageClassificationWithTeacher.from_pretrained(
- "snap-research/efficientformer-l1-300"
- )
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="tf")
- # forward pass
- outputs = model(**inputs, training=False)
- # verify the logits
- expected_shape = tf.TensorShape((1, 1000))
- self.assertEqual(outputs.logits.shape, expected_shape)
- expected_slice = tf.constant([-0.1312, 0.4353, -1.0499])
- self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/ernie_m/__init__.py b/tests/models/ernie_m/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/ernie_m/test_modeling_ernie_m.py b/tests/models/ernie_m/test_modeling_ernie_m.py
deleted file mode 100644
index 17c9aa89f37459..00000000000000
--- a/tests/models/ernie_m/test_modeling_ernie_m.py
+++ /dev/null
@@ -1,323 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ErnieM model."""
-
-import unittest
-
-from transformers import ErnieMConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- ErnieMForInformationExtraction,
- ErnieMForMultipleChoice,
- ErnieMForQuestionAnswering,
- ErnieMForSequenceClassification,
- ErnieMForTokenClassification,
- ErnieMModel,
- )
-
-
-class ErnieMModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
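-    # Label-free variant of the standard inputs ("uiem" presumably refers to UIE-M, universal information
-    # extraction): only input ids and an attention mask are returned.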
- def prepare_config_and_inputs_for_uiem(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
- config = self.get_config()
-
- return config, input_ids, input_mask
-
- def get_config(self):
- return ErnieMConfig(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- initializer_range=self.initializer_range,
- )
-
- def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
- model = ErnieMModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, return_dict=True)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = ErnieMForQuestionAnswering(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def create_and_check_for_information_extraction(
- self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = ErnieMForInformationExtraction(config=config)
- model.to(torch_device)
- model.eval()
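-        # ErnieMForInformationExtraction expects per-token float start/end targets, so replace the integer labels.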
- sequence_labels = torch.ones_like(input_ids, dtype=torch.float32)
- result = model(
- input_ids,
- attention_mask=input_mask,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = ErnieMForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = ErnieMForTokenClassification(config=config)
- model.to(torch_device)
- model.eval()
-        input_ids = input_ids.to(torch_device)
-        input_mask = input_mask.to(torch_device)
-        token_labels = token_labels.to(torch_device)
-
- result = model(input_ids, attention_mask=input_mask, labels=token_labels)
-
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = ErnieMForMultipleChoice(config=config)
- model.to(torch_device)
- model.eval()
- multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- result = model(
- multiple_choice_inputs_ids,
- attention_mask=multiple_choice_input_mask,
- labels=choice_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-class ErnieMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- ErnieMModel,
- ErnieMForMultipleChoice,
- ErnieMForQuestionAnswering,
- ErnieMForSequenceClassification,
- ErnieMForTokenClassification,
- )
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = ()
- pipeline_model_mapping = (
- {
- "feature-extraction": ErnieMModel,
- "question-answering": ErnieMForQuestionAnswering,
- "text-classification": ErnieMForSequenceClassification,
- "token-classification": ErnieMForTokenClassification,
- "zero-shot": ErnieMForSequenceClassification,
- }
- if is_torch_available()
- else {}
- )
- test_torchscript = False
-
- # TODO: Fix the failed tests when this model gets more usage
- def is_pipeline_test_to_skip(
- self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
- ):
- if pipeline_test_casse_name == "QAPipelineTests":
- return True
-
- return False
-
- def setUp(self):
- self.model_tester = ErnieMModelTester(self)
- self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_various_embeddings(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- for type in ["absolute", "relative_key", "relative_key_query"]:
- config_and_inputs[0].position_embedding_type = type
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_information_extraction(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_information_extraction(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "susnato/ernie-m-base_pytorch"
- model = ErnieMModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
-
-@require_torch
-class ErnieMModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_model(self):
- model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch")
- model.eval()
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- output = model(input_ids)[0]
-
- # TODO Replace vocab size
- hidden_size = 768
-
- expected_shape = torch.Size((1, 6, hidden_size))
- self.assertEqual(output.shape, expected_shape)
-
- expected_slice = torch.tensor(
- [[[-0.0012, 0.1245, -0.0214], [-0.0742, 0.0244, -0.0771], [-0.0333, 0.1164, -0.1554]]]
- )
-
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
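
Note: the multiple-choice checks in the tester removed above expand a (batch, seq) tensor to (batch, num_choices, seq) before the forward pass. A minimal standalone sketch of that expansion, using plain PyTorch and hypothetical sizes (independent of the deleted test):

    import torch

    batch_size, num_choices, seq_length = 2, 4, 7
    input_ids = torch.randint(0, 100, (batch_size, seq_length))

    # (batch, seq) -> (batch, num_choices, seq): every choice shares the same token ids here
    multiple_choice_inputs = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
    assert multiple_choice_inputs.shape == (batch_size, num_choices, seq_length)
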
diff --git a/tests/models/ernie_m/test_tokenization_ernie_m.py b/tests/models/ernie_m/test_tokenization_ernie_m.py
deleted file mode 100644
index 01de7d37311ce4..00000000000000
--- a/tests/models/ernie_m/test_tokenization_ernie_m.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ErnieM model."""
-
-import unittest
-
-from transformers import ErnieMTokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
-
-
-@require_sentencepiece
-@require_tokenizers
-class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
- from_pretrained_id = "susnato/ernie-m-base_pytorch"
- tokenizer_class = ErnieMTokenizer
- test_seq2seq = False
- test_sentencepiece = True
- test_rust_tokenizer = False
- test_sentencepiece_ignore_case = False
-
- def setUp(self):
- super().setUp()
-
- # We have a SentencePiece fixture for testing
- tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
- tokenizer.save_pretrained(self.tmpdirname)
-
- def get_input_output_texts(self, tokenizer):
- input_text = "this is a test"
- output_text = "this is a test"
- return input_text, output_text
-
- def test_convert_token_and_id(self):
- """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
- token = "<pad>"
- token_id = 0
-
- self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
- self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
- def test_get_vocab(self):
- vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
- self.assertEqual(vocab_keys[0], "<pad>")
- self.assertEqual(vocab_keys[1], "<unk>")
- self.assertEqual(vocab_keys[-1], "▁eloquent")
- self.assertEqual(len(vocab_keys), 30_000)
-
- def test_vocab_size(self):
- self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
-
- def test_rust_and_python_full_tokenizers(self):
- if not self.test_rust_tokenizer:
- return
-
- tokenizer = self.get_tokenizer()
- rust_tokenizer = self.get_rust_tokenizer()
-
- sequence = "I was born in 92000, and this is falsé."
-
- tokens = tokenizer.tokenize(sequence)
- rust_tokens = rust_tokenizer.tokenize(sequence)
- self.assertListEqual(tokens, rust_tokens)
-
- ids = tokenizer.encode(sequence, add_special_tokens=False)
- rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
- self.assertListEqual(ids, rust_ids)
-
- rust_tokenizer = self.get_rust_tokenizer()
- ids = tokenizer.encode(sequence)
- rust_ids = rust_tokenizer.encode(sequence)
- self.assertListEqual(ids, rust_ids)
-
- def test_full_tokenizer(self):
- tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")
-
- tokens = tokenizer.tokenize("This is a test")
- self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
-
- self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
- tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
- # ErnieMTokenizer(paddlenlp implementation) outputs '9' instead of '_9' so to mimic that '_9' is changed to '9'
- self.assertListEqual(
- tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
- )
- ids = tokenizer.convert_tokens_to_ids(tokens)
- self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
- back_tokens = tokenizer.convert_ids_to_tokens(ids)
- self.assertListEqual(
- back_tokens,
- ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
- )
-
- def test_sequence_builders(self):
- tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
-
- text = tokenizer.encode("sequence builders")
- text_2 = tokenizer.encode("multi-sequence build")
-
- encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
- encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
- assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
- assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + [
- tokenizer.sep_token_id
- ] + text_2 + [tokenizer.sep_token_id]
-
- @slow
- def test_tokenizer_integration(self):
- expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
-
- self.tokenizer_integration_test_util(
- expected_encoding=expected_encoding,
- model_name="susnato/ernie-m-base_pytorch",
- sequences=[
- "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
- "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
- "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
- "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
- "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
- "conditioning on both left and right context in all layers.",
- "The quick brown fox jumps over the lazy dog.",
- ],
- )
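
Note: test_sequence_builders above asserts the ErnieM special-token layout: [CLS] A [SEP] for a single sequence and [CLS] A [SEP] [SEP] B [SEP] for a pair. A minimal sketch of that layout using hypothetical token ids (not real vocabulary entries):

    cls_id, sep_id = 1, 2      # hypothetical special-token ids
    text_a = [11, 12, 13]      # hypothetical ids for "sequence builders"
    text_b = [21, 22]          # hypothetical ids for "multi-sequence build"

    single = [cls_id] + text_a + [sep_id]
    pair = [cls_id] + text_a + [sep_id] + [sep_id] + text_b + [sep_id]
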
diff --git a/tests/models/gptsan_japanese/__init__.py b/tests/models/gptsan_japanese/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
deleted file mode 100644
index 8c48bb5017e9f1..00000000000000
--- a/tests/models/gptsan_japanese/test_modeling_gptsan_japanese.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-
-from transformers import (
- GPTSanJapaneseConfig,
- GPTSanJapaneseForConditionalGeneration,
- GPTSanJapaneseModel,
- GPTSanJapaneseTokenizer,
- is_torch_available,
-)
-from transformers.generation import GenerationConfig
-from transformers.testing_utils import require_torch, slow, tooslow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-class GPTSanJapaneseTester:
- def __init__(
- self,
- parent,
- vocab_size=99,
- batch_size=13,
- num_contexts=7,
- # For common tests
- is_training=True,
- hidden_size=32,
- ext_size=42,
- num_hidden_layers=2,
- num_ext_layers=2,
- num_attention_heads=4,
- num_experts=2,
- d_ff=32,
- d_ext=80,
- d_spout=33,
- dropout_rate=0.0,
- layer_norm_epsilon=1e-6,
- expert_capacity=100,
- router_jitter_noise=0.0,
- ):
- self.vocab_size = vocab_size
- self.parent = parent
- self.batch_size = batch_size
- self.num_contexts = num_contexts
- # For common tests
- self.seq_length = self.num_contexts
- self.is_training = is_training
- self.hidden_size = hidden_size
- self.num_ext_layers = num_ext_layers
- self.ext_size = ext_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.num_experts = num_experts
- self.d_ff = d_ff
- self.d_ext = d_ext
- self.d_spout = d_spout
- self.dropout_rate = dropout_rate
- self.layer_norm_epsilon = layer_norm_epsilon
- self.expert_capacity = expert_capacity
- self.router_jitter_noise = router_jitter_noise
-
- def get_large_model_config(self):
- return GPTSanJapaneseConfig.from_pretrained("Tanrei/GPTSAN-japanese")
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- config = self.get_config()
-
- return (config, input_ids)
-
- def prepare_config_and_inputs_for_common(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- config = self.get_config()
-
- return (config, {"input_ids": input_ids})
-
- def get_config(self):
- return GPTSanJapaneseConfig(
- vocab_size=self.vocab_size,
- num_contexts=self.seq_length,
- d_model=self.hidden_size,
- d_ff=self.d_ff,
- d_ext=self.d_ext,
- d_spout=self.d_spout,
- num_switch_layers=self.num_hidden_layers - self.num_ext_layers,
- num_ext_layers=self.num_ext_layers,
- num_heads=self.num_attention_heads,
- num_experts=self.num_experts,
- expert_capacity=self.expert_capacity,
- dropout_rate=self.dropout_rate,
- layer_norm_epsilon=self.layer_norm_epsilon,
- router_jitter_noise=self.router_jitter_noise,
- )
-
- def create_and_check_model(
- self,
- config,
- input_ids,
- ):
- model = GPTSanJapaneseForConditionalGeneration(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids=input_ids,
- )
- self.parent.assertIsNotNone(result)
-
-
-@require_torch
-class GPTSanJapaneseTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (GPTSanJapaneseModel,) if is_torch_available() else ()
- pipeline_model_mapping = (
- {
- "conversational": GPTSanJapaneseForConditionalGeneration,
- "feature-extraction": GPTSanJapaneseForConditionalGeneration,
- "summarization": GPTSanJapaneseForConditionalGeneration,
- "text2text-generation": GPTSanJapaneseForConditionalGeneration,
- "translation": GPTSanJapaneseForConditionalGeneration,
- }
- if is_torch_available()
- else {}
- )
- fx_compatible = False
- is_encoder_decoder = False
- test_pruning = False
- test_headmasking = False
- test_save_load_fast_init_to_base = False
- test_training = False
- # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
- model_split_percents = [0.5, 0.8, 0.9]
-
- # TODO: Fix the failed tests when this model gets more usage
- def is_pipeline_test_to_skip(
- self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
- ):
- if pipeline_test_casse_name == "SummarizationPipelineTests":
- # TODO: fix `_reorder_cache` is not implemented for this model
- return True
- elif pipeline_test_casse_name == "Text2TextGenerationPipelineTests":
- # TODO: check this.
- return True
-
- return False
-
- def setUp(self):
- self.model_tester = GPTSanJapaneseTester(self)
- self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
- def test_config(self):
- GPTSanJapaneseConfig()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- @unittest.skip(
- reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
- )
- def test_model_parallelism(self):
- super().test_model_parallelism()
-
- @unittest.skip(reason="Gptsan does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="Gptsan does not use inputs_embeds")
- def test_inputs_embeds_matches_input_ids(self):
- pass
-
-
-@require_torch
-class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
- all_model_classes = (GPTSanJapaneseForConditionalGeneration,) if is_torch_available() else ()
- fx_compatible = False
- is_encoder_decoder = False
- test_pruning = False
- test_headmasking = False
- # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests
- model_split_percents = [0.5, 0.8, 0.9]
-
- def setUp(self):
- self.model_tester = GPTSanJapaneseTester(self)
- self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37)
-
- def test_config(self):
- GPTSanJapaneseConfig()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- @unittest.skip(
- reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`"
- )
- def test_model_parallelism(self):
- super().test_model_parallelism()
-
- @unittest.skip(reason="Gptsan does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="Gptsan does not use inputs_embeds")
- def test_inputs_embeds_matches_input_ids(self):
- pass
-
- @slow
- def test_logits(self):
- model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
- tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
- input_ids = tokenizer.encode("武田信玄は", return_tensors="pt")
- outputs = model(input_ids)
- output_logits = outputs.logits.detach().cpu().numpy()
- # Output of original model created with mesh-tensorflow
- # fmt: off
- target = [
- [-12.037839889526367, -12.433061599731445, -14.333840370178223, -12.450345993041992, -11.1661376953125,
- -11.930137634277344, -10.659740447998047, -12.909574508666992, -13.241043090820312, -13.398579597473145,
- -11.107524871826172, -12.3685941696167, -22.97943115234375, -10.481067657470703, -12.484030723571777,
- -12.807360649108887, -14.769700050354004, -12.233579635620117, -13.428145408630371, -22.624177932739258],
- [-7.511149883270264, -8.281851768493652, -7.943127155303955, -7.55021333694458, -6.49869966506958,
- -7.586796283721924, -6.978085994720459, -7.839145183563232, -8.21964168548584, -8.695091247558594,
- -6.706910610198975, -6.6585798263549805, -19.565698623657227, -5.353842735290527, -8.350686073303223,
- -8.039388656616211, -10.856569290161133, -7.75154447555542, -8.819022178649902, -19.51532745361328],
- [-9.73066234588623, -10.223922729492188, -9.932981491088867, -11.857836723327637, -7.662626266479492,
- -11.13529109954834, -7.765097618103027, -11.472923278808594, -9.543149948120117, -11.905633926391602,
- -9.366164207458496, -11.5734281539917, -23.699003219604492, -9.429590225219727, -10.42839241027832,
- -10.585240364074707, -10.94771957397461, -11.095416069030762, -10.390240669250488, -23.769372940063477],
- [-9.728265762329102, -9.859712600708008, -10.09729290008545, -9.678522109985352, -6.879519939422607,
- -9.68487548828125, -4.2803425788879395, -10.018914222717285, -9.308445930480957, -10.63394546508789,
- -8.083646774291992, -9.06301498413086, -21.904266357421875, -8.90160846710205, -8.841876029968262,
- -11.856719970703125, -12.079398155212402, -11.233753204345703, -10.177338600158691, -21.87256622314453],
- [-9.669764518737793, -9.614198684692383, -9.814510345458984, -9.996501922607422, -11.375690460205078,
- -10.113405227661133, -10.546867370605469, -10.04369068145752, -10.907809257507324, -10.504216194152832,
- -11.129199028015137, -10.151124000549316, -21.96586799621582, -9.086349487304688, -11.730339050292969,
- -10.460667610168457, -10.298049926757812, -10.784148216247559, -10.840693473815918, -22.03152847290039],
- ]
- # fmt: on
- target = np.array(target).flatten()
- predict = output_logits[0, :, :20].flatten()
-
- def check(a, b, epsilon=5e-4):
- return abs(a - b) < epsilon * max(abs(a), abs(b))
-
- self.assertTrue(np.all([check(target[i], predict[i]) for i in range(len(target))]))
-
- @slow
- def test_batch_generation(self):
- model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
- tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
- model.to(torch_device)
-
- # set deterministically
- generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
- generation_config.top_k = 1
-
- # use different length sentences to test batching
- sentences = [
- "甲斐なら武田と言うほど",
- "織田信長は、",
- ]
-
- tokenizer.padding_side = "left"
- inputs = tokenizer(sentences, return_tensors="pt", padding=True)
- input_ids = inputs["input_ids"].to(torch_device)
-
- self.assertNotEqual(inputs["attention_mask"][0].numpy().tolist(), inputs["attention_mask"][1].numpy().tolist())
-
- outputs = model.generate(
- input_ids=input_ids,
- attention_mask=inputs["attention_mask"].to(torch_device),
- max_new_tokens=3,
- generation_config=generation_config,
- )
-
- inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
- output_non_padded = model.generate(
- input_ids=inputs_non_padded, max_new_tokens=3, generation_config=generation_config
- )
-
- inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
- output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=3, generation_config=generation_config)
-
- self.assertNotEqual(inputs_non_padded.shape, inputs_padded.shape)
-
- batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
- non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
- padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
- expected_output_sentence = [
- "甲斐なら武田と言うほど甲斐の武田",
- "織田信長は、このような",
- ]
- self.assertListEqual(expected_output_sentence, batch_out_sentence)
- self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence])
-
- @tooslow
- def test_sample(self):
- model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
- tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
- # Output of original model created with mesh-tensorflow
- target = [
- ("武田信玄は", 35675),
- ("武田信玄は、", 45),
- ("武田信玄は、この", 29),
- ("武田信玄は、このよう", 30642),
- ("武田信玄は、このような", 35680),
- ("武田信玄は、このような「", 8640),
- ("武田信玄は、このような「武田", 31617),
- ("武田信玄は、このような「武田家", 30646),
- ("武田信玄は、このような「武田家の", 31617),
- ("武田信玄は、このような「武田家の家", 31381),
- ]
- for input, output in target:
- input_ids = tokenizer.encode(input, return_tensors="pt")
- outputs = model(input_ids)
- output_logits = outputs.logits.detach().cpu().numpy()[0]
- output_id = np.argmax(output_logits[-1])
- self.assertEqual(output_id, output)
-
- @slow
- def test_spout_generation(self):
- model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
- tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
- model.to(torch_device)
-
- # set deterministically
- generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
- generation_config.top_k = 1
-
- input_text = "武田信玄は、"
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(torch_device)
- input_ids_batch = tokenizer([input_text, input_text], return_tensors="pt").input_ids.to(torch_device)
-
- # spout from uniform and one-hot
-
- spouts = [
- [0.87882208, 0.38426396, 0.33220248, 0.43890406, 0.16562252,
- 0.04803985, 0.211572 , 0.23188473, 0.37153068, 0.7836377 ,
- 0.02160172, 0.38761719, 0.75290772, 0.90198857, 0.34365777,
- 0.64168169, 0.44318471, 0.14575746, 0.92562881, 0.40812148,
- 0.29019122, 0.88861599, 0.65524846, 0.43563456, 0.38177187,
- 0.70832965, 0.81527892, 0.68832812, 0.38833192, 0.4561522 ,
- 0.14828817, 0.47248213, 0.54357335, 0.82009566, 0.1338884 ,
- 0.02755417, 0.19764677, 0.2422084 , 0.04757674, 0.65409606,
- 0.0824589 , 0.03304383, 0.94387689, 0.98764509, 0.82433901,
- 0.27646741, 0.64907493, 0.76009406, 0.30087915, 0.17904689,
- 0.41601714, 0.67046398, 0.10422822, 0.08447374, 0.07354344,
- 0.61423565, 0.70284866, 0.7532333 , 0.1972038 , 0.29575659,
- 0.90583886, 0.29265307, 0.50000175, 0.70407655, 0.889363 ,
- 0.81904418, 0.66829128, 0.64468815, 0.56563723, 0.85601875,
- 0.94924672, 0.00166762, 0.25220643, 0.74540219, 0.67993247,
- 0.1549675 , 0.39385352, 0.92153607, 0.63745931, 0.27759043,
- 0.84702295, 0.65904271, 0.58676614, 0.8666936 , 0.39607438,
- 0.79954983, 0.42220697, 0.39650381, 0.7849864 , 0.56150201,
- 0.15678925, 0.14746032, 0.34542114, 0.47026783, 0.11956489,
- 0.25421435, 0.33788901, 0.68934842, 0.36424685, 0.71737898,
- 0.38983449, 0.94393779, 0.39575588, 0.36616553, 0.87104665,
- 0.64630203, 0.22516905, 0.88270804, 0.15031338, 0.75144345,
- 0.46459025, 0.85396454, 0.86355643, 0.65139851, 0.70266061,
- 0.30241389, 0.81056497, 0.88865969, 0.38773807, 0.70635849,
- 0.90718459, 0.43245789, 0.28000654, 0.45935562, 0.08773519,
- 0.9552151 , 0.93901511, 0.22489288], # uniform
- [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
- 0., 0., 0., 0., 0., 0., 0., 0.],
- ] # fmt: skip
-
- output1 = model.generate(
- input_ids=input_ids,
- spout=spouts[0],
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- output2 = model.generate(
- input_ids=input_ids,
- spout=spouts[1],
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- output3 = model.generate(
- input_ids=input_ids_batch,
- spout=spouts,
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- out1_sentence = tokenizer.decode(output1[0])
- out2_sentence = tokenizer.decode(output2[0])
- batch_out_sentence = tokenizer.batch_decode(output3)
-
- expected_output_sentence = [
- "武田信玄は、武田氏の滅亡後、武田氏の居城であった甲斐武田氏の居城である",
- "武田信玄は、武田家の滅亡を防ぐため、武田家の家臣である武田信虎を討",
- ]
- self.assertListEqual(expected_output_sentence, batch_out_sentence)
- self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
-
- @slow
- def test_prefix_lm_generation(self):
- model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")
- tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
- model.to(torch_device)
-
- # set deterministically
- generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese")
- generation_config.top_k = 1
-
- prefix_text_1 = "武田信玄"
- prefix_text_2 = "織田信長"
- input_text_1 = "は、"
- input_text_2 = "が、"
- input_tok_1 = tokenizer(input_text_1, prefix_text=prefix_text_1, return_tensors="pt")
- input_tok_2 = tokenizer(input_text_2, prefix_text=prefix_text_2, return_tensors="pt")
- input_tok_3 = tokenizer([[prefix_text_1, input_text_1], [prefix_text_2, input_text_2]], return_tensors="pt")
-
- output1 = model.generate(
- input_ids=input_tok_1.input_ids.to(torch_device),
- token_type_ids=input_tok_1.token_type_ids.to(torch_device),
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- output2 = model.generate(
- input_ids=input_tok_2.input_ids.to(torch_device),
- token_type_ids=input_tok_2.token_type_ids.to(torch_device),
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- output3 = model.generate(
- input_ids=input_tok_3.input_ids.to(torch_device),
- token_type_ids=input_tok_3.token_type_ids.to(torch_device),
- attention_mask=input_tok_3.attention_mask.to(torch_device),
- max_new_tokens=20,
- generation_config=generation_config,
- )
-
- out1_sentence = tokenizer.decode(output1[0])
- out2_sentence = tokenizer.decode(output2[0])
- batch_out_sentence = tokenizer.batch_decode(output3)
-
- expected_output_sentence = [
- "武田信玄は、武田氏の祖である武田信虎を、その子・武田信友を擁して",
- "織田信長が、織田信長の妻・お市の方を妻として迎えたという逸話が残",
- ]
- self.assertListEqual(expected_output_sentence, batch_out_sentence)
- self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence])
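
Note: test_logits above compares against the mesh-tensorflow reference with a purely relative tolerance, |a - b| < epsilon * max(|a|, |b|), rather than torch.allclose. A vectorized NumPy sketch of the same check, with illustrative values only:

    import numpy as np

    def rel_close(a, b, epsilon=5e-4):
        # elementwise |a - b| < epsilon * max(|a|, |b|), as in the removed test
        return np.abs(a - b) < epsilon * np.maximum(np.abs(a), np.abs(b))

    target = np.array([-12.0378, -7.5111, -9.7307])   # illustrative reference values
    predict = np.array([-12.0380, -7.5109, -9.7310])  # illustrative model outputs
    assert rel_close(target, predict).all()
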
diff --git a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
deleted file mode 100644
index 8d989a51a732fa..00000000000000
--- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.models.gptsan_japanese.tokenization_gptsan_japanese import (
- VOCAB_FILES_NAMES,
- GPTSanJapaneseTokenizer,
-)
-from transformers.testing_utils import require_jinja, require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_tokenizers
-class GPTSanJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
- from_pretrained_id = "Tanrei/GPTSAN-japanese"
- tokenizer_class = GPTSanJapaneseTokenizer
- test_rust_tokenizer = False
- from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
-
- def setUp(self):
- super().setUp()
-
- vocab_tokens = ["こん", "こんに", "にちは", "ばんは", "世界,㔺界", "、", "。", "<BR>", "<SP>", "<TAB>", "<URL>", "<EMAIL>", "<TEL>", "<DATE>", "<PRICE>", "<BLOCK>", "<KIGOU>", "<U2000U2BFF>", "<|emoji1|>", "<unk>", "<|bagoftoken|>", "<|endoftext|>"]  # fmt: skip
- emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀
- self.special_tokens_map = {"unk_token": "<unk>"}
-
- self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
- self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
- with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
- vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
- with open(self.emoji_file, "w") as emoji_writer:
- emoji_writer.write(json.dumps(emoji_tokens))
-
- def get_tokenizer(self, **kwargs):
- kwargs.update(self.special_tokens_map)
- return GPTSanJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_input_output_texts
- def get_input_output_texts(self, tokenizer):
- input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"
- output_text = "こんにちは、世界。 \nこんばんは、世界。😀"
- return input_text, output_text
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.get_clean_sequence
- def get_clean_sequence(self, tokenizer):
- input_text, output_text = self.get_input_output_texts(tokenizer)
- ids = tokenizer.encode(output_text, add_special_tokens=False)
- text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
- return text, ids
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_pretokenized_inputs
- def test_pretokenized_inputs(self):
- pass # TODO add if relevant
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_pair_input
- def test_maximum_encoding_length_pair_input(self):
- pass # TODO add if relevant
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_maximum_encoding_length_single_input
- def test_maximum_encoding_length_single_input(self):
- pass # TODO add if relevant
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_full_tokenizer
- def test_full_tokenizer(self):
- tokenizer = self.get_tokenizer()
-
- # Testing tokenization
- input_text = "こんにちは、世界。 こんばんは、㔺界。"
- expected_token = ["こん", "にちは", "、", "世界", "。", "<SP>", "こん", "ばんは", "、", "㔺界", "。"]
- tokens = tokenizer.tokenize(input_text)
- self.assertListEqual(tokens, expected_token)
-
- # Testing conversion to ids without special tokens
- expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6]
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
- self.assertListEqual(input_ids, expected_ids)
-
- # Testing conversion to ids with special tokens
- input_tokens = tokens + [tokenizer.unk_token]
- expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6, 19]
- input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
- self.assertListEqual(input_ids, expected_ids)
-
- def test_token_bagging(self):
- tokenizer = self.get_tokenizer()
-
- # Testing tokenization
- input_text = "こんにちは、<|bagoftoken|>世界。こんばんは、<|bagoftoken|>㔺界。"
- expected_text = "こんにちは、、、、世界。こんばんは、、、、世界。"
- tokens = tokenizer.encode(input_text)
- output_text = tokenizer.decode(tokens)
- self.assertEqual(output_text, expected_text)
-
- @slow
- def test_prefix_input(self):
- tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
- # Testing tokenization
- prefix_text = "こんにちは、世界。"
- input_text = "こんばんは、㔺界。😀"
- expected_text = "こんにちは、世界。こんばんは、世界。😀"
- tokens_1 = tokenizer.encode(prefix_text + input_text)
- tokens_2 = tokenizer.encode("", prefix_text=prefix_text + input_text)
- tokens_3 = tokenizer.encode(input_text, prefix_text=prefix_text)
- output_text_1 = tokenizer.decode(tokens_1)
- output_text_2 = tokenizer.decode(tokens_2)
- output_text_3 = tokenizer.decode(tokens_3)
- self.assertEqual(output_text_1, expected_text)
- self.assertEqual(output_text_2, expected_text)
- self.assertEqual(output_text_3, expected_text)
-
- @slow
- def test_token_type_ids(self):
- tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
- # Testing tokenization
- prefix_text = "こんにちは、世界。"
- input_text = "こんばんは、㔺界。😀"
-
- len_prefix = len(tokenizer.encode(prefix_text)) - 2
- len_text = len(tokenizer.encode(input_text)) - 2
-
- expected_mask_1 = [1] + [0] * (len_prefix + len_text + 1)
- expected_mask_2 = [1] * (len_prefix + len_text + 1) + [0]
- expected_mask_3 = [1] + [1] * (len_prefix) + [0] * (len_text + 1)
-
- type_id_1 = tokenizer(prefix_text + input_text).token_type_ids
- type_id_2 = tokenizer("", prefix_text=prefix_text + input_text).token_type_ids
- type_id_3 = tokenizer(input_text, prefix_text=prefix_text).token_type_ids
- self.assertListEqual(type_id_1, expected_mask_1)
- self.assertListEqual(type_id_2, expected_mask_2)
- self.assertListEqual(type_id_3, expected_mask_3)
-
- @slow
- def test_prefix_tokens(self):
- tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
- x_token_1 = tokenizer.encode("あンいワ")
- x_token_2 = tokenizer.encode("", prefix_text="あンいワ")
- x_token_3 = tokenizer.encode("いワ", prefix_text="あン")
-
- self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_2))
- self.assertEqual(tokenizer.decode(x_token_1), tokenizer.decode(x_token_3))
- self.assertNotEqual(x_token_1, x_token_2)
- self.assertNotEqual(x_token_1, x_token_3)
- self.assertEqual(x_token_1[1], x_token_2[-1]) # SEG token
- self.assertEqual(x_token_1[1], x_token_3[3]) # SEG token
-
- @slow
- def test_batch_encode(self):
- tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
-
- input_pairs = [["武田信玄", "は、"], ["織田信長", "の配下の、"]]
- x_token = tokenizer(input_pairs, padding=True)
- x_token_2 = tokenizer.batch_encode_plus(input_pairs, padding=True)
-
- # fmt: off
- expected_outputs = [[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]]
- expected_typeids = [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]]
- expected_attmask = [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]
- # fmt: on
- self.assertListEqual(x_token.input_ids, expected_outputs)
- self.assertListEqual(x_token.token_type_ids, expected_typeids)
- self.assertListEqual(x_token.attention_mask, expected_attmask)
- self.assertListEqual(x_token_2.input_ids, expected_outputs)
- self.assertListEqual(x_token_2.token_type_ids, expected_typeids)
- self.assertListEqual(x_token_2.attention_mask, expected_attmask)
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_conversion_reversible
- def test_conversion_reversible(self):
- # Intentionally convert some words to accommodate character fluctuations unique to Japanese
- pass
-
- # Copied from tests.models.gpt_neox_japanese.test_tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizationTest.test_padding_different_model_input_name
- def test_padding_different_model_input_name(self):
- # tokenizer has no padding token
- pass
-
- @require_jinja
- def test_tokenization_for_chat(self):
- tokenizer = self.tokenizer_class.from_pretrained("Tanrei/GPTSAN-japanese")
- # This is in English, but it's just here to make sure the chat control tokens are being added properly
- test_chats = [
- [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
- [
- {"role": "system", "content": "You are a helpful chatbot."},
- {"role": "user", "content": "Hello!"},
- {"role": "assistant", "content": "Nice to meet you."},
- ],
- [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
- ]
- tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
- # fmt: off
- expected_tokens = [
- [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999],
- [35993, 35998, 35637, 35659, 35665, 35716, 35645, 35662, 35649, 35716, 35645, 35716, 35652, 35649, 35656, 35660, 35650, 35665, 35656, 35716, 35647, 35652, 35645, 35664, 35646, 35659, 35664, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999, 35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999],
- [35993, 35998, 35626, 35653, 35647, 35649, 35716, 35664, 35659, 35716, 35657, 35649, 35649, 35664, 35716, 35669, 35659, 35665, 35595, 35716, 35999, 35993, 35998, 35620, 35649, 35656, 35656, 35659, 35582, 35716, 35999]
- ]
- # fmt: on
- for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
- self.assertListEqual(tokenized_chat, expected_tokens)
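
Note: test_token_type_ids above asserts masks of total length len_prefix + len_text + 2; the two extra positions are assumed here to correspond to the leading start token and the trailing segment separator. A small sketch that reproduces just that arithmetic, with no tokenizer involved:

    def build_masks(len_prefix, len_text):
        # prefix positions are marked 1, plain-text positions 0; lengths mirror the removed test
        mask_all_text = [1] + [0] * (len_prefix + len_text + 1)
        mask_all_prefix = [1] * (len_prefix + len_text + 1) + [0]
        mask_split = [1] + [1] * len_prefix + [0] * (len_text + 1)
        return mask_all_text, mask_all_prefix, mask_split

    masks = build_masks(len_prefix=3, len_text=4)
    assert all(len(m) == 3 + 4 + 2 for m in masks)
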
diff --git a/tests/models/graphormer/__init__.py b/tests/models/graphormer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/graphormer/test_modeling_graphormer.py b/tests/models/graphormer/test_modeling_graphormer.py
deleted file mode 100644
index 55b1ccc34a11a7..00000000000000
--- a/tests/models/graphormer/test_modeling_graphormer.py
+++ /dev/null
@@ -1,1300 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Graphormer model."""
-
-import copy
-import inspect
-import os
-import tempfile
-import unittest
-
-from transformers import GraphormerConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
- from torch import tensor
-
- from transformers import GraphormerForGraphClassification, GraphormerModel
-
-
-class GraphormerModelTester:
- def __init__(
- self,
- parent,
- num_classes=1,
- num_atoms=32 * 9,
- num_edges=32 * 3,
- num_in_degree=32,
- num_out_degree=32,
- num_spatial=32,
- num_edge_dis=16,
- multi_hop_max_dist=5, # sometimes is 20
- spatial_pos_max=32,
- edge_type="multi_hop",
- init_fn=None,
- max_nodes=32,
- share_input_output_embed=False,
- num_hidden_layers=2,
- embedding_dim=32,
- ffn_embedding_dim=32,
- num_attention_heads=4,
- dropout=0.1,
- attention_dropout=0.1,
- activation_dropout=0.1,
- layerdrop=0.0,
- encoder_normalize_before=False,
- pre_layernorm=False,
- apply_graphormer_init=False,
- activation_fn="gelu",
- embed_scale=None,
- freeze_embeddings=False,
- num_trans_layers_to_freeze=0,
- traceable=False,
- q_noise=0.0,
- qn_block_size=8,
- kdim=None,
- vdim=None,
- bias=True,
- self_attention=True,
- batch_size=10,
- graph_size=20,
- is_training=True,
- ):
- self.parent = parent
- self.num_classes = num_classes
- self.num_labels = num_classes
- self.num_atoms = num_atoms
- self.num_in_degree = num_in_degree
- self.num_out_degree = num_out_degree
- self.num_edges = num_edges
- self.num_spatial = num_spatial
- self.num_edge_dis = num_edge_dis
- self.edge_type = edge_type
- self.multi_hop_max_dist = multi_hop_max_dist
- self.spatial_pos_max = spatial_pos_max
- self.max_nodes = max_nodes
- self.num_hidden_layers = num_hidden_layers
- self.embedding_dim = embedding_dim
- self.hidden_size = embedding_dim
- self.ffn_embedding_dim = ffn_embedding_dim
- self.num_attention_heads = num_attention_heads
- self.dropout = dropout
- self.attention_dropout = attention_dropout
- self.activation_dropout = activation_dropout
- self.layerdrop = layerdrop
- self.encoder_normalize_before = encoder_normalize_before
- self.pre_layernorm = pre_layernorm
- self.apply_graphormer_init = apply_graphormer_init
- self.activation_fn = activation_fn
- self.embed_scale = embed_scale
- self.freeze_embeddings = freeze_embeddings
- self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
- self.share_input_output_embed = share_input_output_embed
- self.traceable = traceable
- self.q_noise = q_noise
- self.qn_block_size = qn_block_size
- self.init_fn = init_fn
- self.kdim = kdim
- self.vdim = vdim
- self.self_attention = self_attention
- self.bias = bias
- self.batch_size = batch_size
- self.graph_size = graph_size
- self.is_training = is_training
-
- def prepare_config_and_inputs(self):
- attn_bias = ids_tensor(
- [self.batch_size, self.graph_size + 1, self.graph_size + 1], self.num_atoms
- ) # Def not sure here
- attn_edge_type = ids_tensor([self.batch_size, self.graph_size, self.graph_size, 1], self.num_edges)
- spatial_pos = ids_tensor([self.batch_size, self.graph_size, self.graph_size], self.num_spatial)
- in_degree = ids_tensor([self.batch_size, self.graph_size], self.num_in_degree)
- out_degree = ids_tensor([self.batch_size, self.graph_size], self.num_out_degree)
- input_nodes = ids_tensor([self.batch_size, self.graph_size, 1], self.num_atoms)
- input_edges = ids_tensor(
- [self.batch_size, self.graph_size, self.graph_size, self.multi_hop_max_dist, 1], self.num_edges
- )
- labels = ids_tensor([self.batch_size], self.num_classes)
-
- config = self.get_config()
-
- return config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
-
- def get_config(self):
- return GraphormerConfig(
- num_atoms=self.num_atoms,
- num_in_degree=self.num_in_degree,
- num_out_degree=self.num_out_degree,
- num_edges=self.num_edges,
- num_spatial=self.num_spatial,
- num_edge_dis=self.num_edge_dis,
- edge_type=self.edge_type,
- multi_hop_max_dist=self.multi_hop_max_dist,
- spatial_pos_max=self.spatial_pos_max,
- max_nodes=self.max_nodes,
- num_hidden_layers=self.num_hidden_layers,
- embedding_dim=self.embedding_dim,
- hidden_size=self.embedding_dim,
- ffn_embedding_dim=self.ffn_embedding_dim,
- num_attention_heads=self.num_attention_heads,
- dropout=self.dropout,
- attention_dropout=self.attention_dropout,
- activation_dropout=self.activation_dropout,
- layerdrop=self.layerdrop,
- encoder_normalize_before=self.encoder_normalize_before,
- pre_layernorm=self.pre_layernorm,
- apply_graphormer_init=self.apply_graphormer_init,
- activation_fn=self.activation_fn,
- embed_scale=self.embed_scale,
- freeze_embeddings=self.freeze_embeddings,
- num_trans_layers_to_freeze=self.num_trans_layers_to_freeze,
- share_input_output_embed=self.share_input_output_embed,
- traceable=self.traceable,
- q_noise=self.q_noise,
- qn_block_size=self.qn_block_size,
- init_fn=self.init_fn,
- kdim=self.kdim,
- vdim=self.vdim,
- self_attention=self.self_attention,
- bias=self.bias,
- )
-
- def create_and_check_model(
- self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
- ):
- model = GraphormerModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_nodes=input_nodes,
- attn_bias=attn_bias,
- in_degree=in_degree,
- out_degree=out_degree,
- spatial_pos=spatial_pos,
- input_edges=input_edges,
- attn_edge_type=attn_edge_type,
- labels=labels,
- )
- self.parent.assertEqual(
- result.last_hidden_state.shape, (self.batch_size, self.graph_size + 1, self.hidden_size)
- )
-
- def create_and_check_for_graph_classification(
- self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels
- ):
- model = GraphormerForGraphClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_nodes=input_nodes,
- attn_bias=attn_bias,
- in_degree=in_degree,
- out_degree=out_degree,
- spatial_pos=spatial_pos,
- input_edges=input_edges,
- attn_edge_type=attn_edge_type,
- labels=labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- attn_bias,
- attn_edge_type,
- spatial_pos,
- in_degree,
- out_degree,
- input_nodes,
- input_edges,
- labels,
- ) = config_and_inputs
- inputs_dict = {
- "attn_bias": attn_bias,
- "attn_edge_type": attn_edge_type,
- "spatial_pos": spatial_pos,
- "in_degree": in_degree,
- "out_degree": out_degree,
- "input_nodes": input_nodes,
- "input_edges": input_edges,
- "labels": labels,
- }
- return config, inputs_dict
-
-
-@require_torch
-class GraphormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (GraphormerForGraphClassification, GraphormerModel) if is_torch_available() else ()
- all_generative_model_classes = ()
- pipeline_model_mapping = {"feature-extraction": GraphormerModel} if is_torch_available() else {}
- test_pruning = False
- test_head_masking = False
- test_resize_embeddings = False
- main_input_name_nodes = "input_nodes"
- main_input_name_edges = "input_edges"
- has_attentions = False # does not output attention
-
- def setUp(self):
- self.model_tester = GraphormerModelTester(self)
- self.config_tester = ConfigTester(self, config_class=GraphormerConfig, has_text_modality=False)
-
- # overwrite from common as `Graphormer` requires more input arguments
- def _create_and_check_torchscript(self, config, inputs_dict):
- if not self.test_torchscript:
- return
-
- configs_no_init = _config_zero_init(config) # To be sure we have no Nan
- configs_no_init.torchscript = True
- for model_class in self.all_model_classes:
- model = model_class(config=configs_no_init)
- model.to(torch_device)
- model.eval()
- inputs = self._prepare_for_class(inputs_dict, model_class)
-
- try:
- required_keys = (
- "input_nodes",
- "input_edges",
- "attn_bias",
- "in_degree",
- "out_degree",
- "spatial_pos",
- "attn_edge_type",
- )
- required_inputs = tuple(inputs[k] for k in required_keys)
- model(*required_inputs)
- traced_model = torch.jit.trace(model, required_inputs)
- except RuntimeError:
- self.fail("Couldn't trace module.")
-
- with tempfile.TemporaryDirectory() as tmp_dir_name:
- pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
-
- try:
- torch.jit.save(traced_model, pt_file_name)
- except Exception:
- self.fail("Couldn't save module.")
-
- try:
- loaded_model = torch.jit.load(pt_file_name)
- except Exception:
- self.fail("Couldn't load module.")
-
- model.to(torch_device)
- model.eval()
-
- loaded_model.to(torch_device)
- loaded_model.eval()
-
- model_state_dict = model.state_dict()
- loaded_model_state_dict = loaded_model.state_dict()
-
- non_persistent_buffers = {}
- for key in loaded_model_state_dict.keys():
- if key not in model_state_dict.keys():
- non_persistent_buffers[key] = loaded_model_state_dict[key]
-
- loaded_model_state_dict = {
- key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
- }
-
- self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
-
- model_buffers = list(model.buffers())
- for non_persistent_buffer in non_persistent_buffers.values():
- found_buffer = False
- for i, model_buffer in enumerate(model_buffers):
- if torch.equal(non_persistent_buffer, model_buffer):
- found_buffer = True
- break
-
- self.assertTrue(found_buffer)
- model_buffers.pop(i)
-
- model_buffers = list(model.buffers())
- for non_persistent_buffer in non_persistent_buffers.values():
- found_buffer = False
- for i, model_buffer in enumerate(model_buffers):
- if torch.equal(non_persistent_buffer, model_buffer):
- found_buffer = True
- break
-
- self.assertTrue(found_buffer)
- model_buffers.pop(i)
-
- models_equal = True
- for layer_name, p1 in model_state_dict.items():
- if layer_name in loaded_model_state_dict:
- p2 = loaded_model_state_dict[layer_name]
- if p1.data.ne(p2.data).sum() > 0:
- models_equal = False
-
- self.assertTrue(models_equal)
-
- # Avoid memory leak. Without this, each call increases RAM usage by ~20MB.
- # (Even with this call, there is still a memory leak of ~0.04MB.)
- self.clear_torch_jit_class_registry()
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- @unittest.skip(reason="Graphormer does not use a single input embedding but three")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="Graphormer does not implement feed forward chunking")
- def test_feed_forward_chunking(self):
- pass
-
- @unittest.skip(reason="Graphormer does not share input and output embeddings")
- def test_model_common_attributes(self):
- pass
-
- def test_initialization(self):
- def _config_zero_init(config):
- configs_no_init = copy.deepcopy(config)
- for key in configs_no_init.__dict__.keys():
- if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
- setattr(configs_no_init, key, 1e-10)
- return configs_no_init
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- configs_no_init = _config_zero_init(config)
- for model_class in self.all_model_classes:
- model = model_class(config=configs_no_init)
- for name, param in model.named_parameters():
- if param.requires_grad:
- self.assertTrue(
- -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
- msg=f"Parameter {name} of model {model_class} seems not properly initialized",
- )
-
- def test_hidden_states_output(self):
- def check_hidden_states_output(inputs_dict, config, model_class):
- model = model_class(config)
- model.to(torch_device)
- model.eval()
-
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
- hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
-
- expected_num_layers = getattr(
- self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
- )
- self.assertEqual(len(hidden_states), expected_num_layers)
-
- batch_size = self.model_tester.batch_size
-
- self.assertListEqual(
- list(hidden_states[0].shape[-2:]),
- [batch_size, self.model_tester.hidden_size],
- )
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- # Always returns hidden_states
- check_hidden_states_output(inputs_dict, config, model_class)
-
- def test_retain_grad_hidden_states_attentions(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.output_hidden_states = True
- config.output_attentions = False
-
- # no need to test all models as different heads yield the same functionality
- model_class = self.all_model_classes[0]
- model = model_class(config)
- model.to(torch_device)
-
- outputs = model(**inputs_dict)
- output = outputs[0]
-
- hidden_states = outputs.hidden_states[0]
- hidden_states.retain_grad()
-
- output.flatten()[0].backward(retain_graph=True)
-
- self.assertIsNotNone(hidden_states.grad)
-
- # Inputs are 'input_nodes' and 'input_edges' not 'input_ids'
- def test_model_main_input_name(self):
- for model_class in self.all_model_classes:
- model_signature = inspect.signature(getattr(model_class, "forward"))
- # The main input is the name of the argument after `self`
- observed_main_input_name_nodes = list(model_signature.parameters.keys())[1]
- observed_main_input_name_edges = list(model_signature.parameters.keys())[2]
- self.assertEqual(model_class.main_input_name_nodes, observed_main_input_name_nodes)
- self.assertEqual(model_class.main_input_name_edges, observed_main_input_name_edges)
-
- def test_forward_signature(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- signature = inspect.signature(model.forward)
- # signature.parameters is an OrderedDict => so arg_names order is deterministic
- arg_names = [*signature.parameters.keys()]
-
- expected_arg_names = ["input_nodes", "input_edges"]
- self.assertListEqual(arg_names[:2], expected_arg_names)
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_graph_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_graph_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "clefourrier/graphormer-base-pcqm4mv1"
- model = GraphormerForGraphClassification.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
-
-@require_torch
-class GraphormerModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_graph_classification(self):
- model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2")
-
- # Real graph data from the MUTAG dataset
- # fmt: off
- model_input = {
- "attn_bias": tensor(
- [
- [
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
- ],
- [
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")],
- ],
- ]
- ),
- "attn_edge_type": tensor(
- [
- [
- [[0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [3], [0], [3], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [3], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0]],
- [[0], [0], [0], [3], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]],
- ],
- [
- [[0], [3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]],
- [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [3], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0]],
- [[3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
- ],
- ]
- ),
- # fmt: on
- "spatial_pos": tensor(
- [
- [
- [1, 2, 3, 4, 3, 2, 4, 5, 6, 5, 6, 7, 8, 7, 9, 10, 10],
- [2, 1, 2, 3, 4, 3, 5, 6, 5, 4, 5, 6, 7, 6, 8, 9, 9],
- [3, 2, 1, 2, 3, 4, 4, 5, 4, 3, 4, 5, 6, 5, 7, 8, 8],
- [4, 3, 2, 1, 2, 3, 3, 4, 3, 2, 3, 4, 5, 4, 6, 7, 7],
- [3, 4, 3, 2, 1, 2, 2, 3, 4, 3, 4, 5, 6, 5, 7, 8, 8],
- [2, 3, 4, 3, 2, 1, 3, 4, 5, 4, 5, 6, 7, 6, 8, 9, 9],
- [4, 5, 4, 3, 2, 3, 1, 2, 3, 4, 5, 6, 5, 4, 6, 7, 7],
- [5, 6, 5, 4, 3, 4, 2, 1, 2, 3, 4, 5, 4, 3, 5, 6, 6],
- [6, 5, 4, 3, 4, 5, 3, 2, 1, 2, 3, 4, 3, 2, 4, 5, 5],
- [5, 4, 3, 2, 3, 4, 4, 3, 2, 1, 2, 3, 4, 3, 5, 6, 6],
- [6, 5, 4, 3, 4, 5, 5, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5],
- [7, 6, 5, 4, 5, 6, 6, 5, 4, 3, 2, 1, 2, 3, 3, 4, 4],
- [8, 7, 6, 5, 6, 7, 5, 4, 3, 4, 3, 2, 1, 2, 2, 3, 3],
- [7, 6, 5, 4, 5, 6, 4, 3, 2, 3, 4, 3, 2, 1, 3, 4, 4],
- [9, 8, 7, 6, 7, 8, 6, 5, 4, 5, 4, 3, 2, 3, 1, 2, 2],
- [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 1, 3],
- [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 3, 1],
- ],
- [
- [1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 4, 5, 5, 0, 0, 0, 0],
- [2, 1, 2, 3, 4, 5, 4, 3, 4, 3, 5, 6, 6, 0, 0, 0, 0],
- [3, 2, 1, 2, 3, 4, 3, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0],
- [4, 3, 2, 1, 2, 3, 4, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0],
- [5, 4, 3, 2, 1, 2, 3, 4, 5, 6, 6, 7, 7, 0, 0, 0, 0],
- [6, 5, 4, 3, 2, 1, 2, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0],
- [5, 4, 3, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0],
- [4, 3, 2, 3, 4, 3, 2, 1, 2, 3, 3, 4, 4, 0, 0, 0, 0],
- [3, 4, 3, 4, 5, 4, 3, 2, 1, 2, 2, 3, 3, 0, 0, 0, 0],
- [2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 3, 4, 4, 0, 0, 0, 0],
- [4, 5, 4, 5, 6, 5, 4, 3, 2, 3, 1, 2, 2, 0, 0, 0, 0],
- [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 1, 3, 0, 0, 0, 0],
- [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 3, 1, 0, 0, 0, 0],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- ],
- ]
- ),
- "in_degree": tensor(
- [
- [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2],
- [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0],
- ]
- ),
- "out_degree": tensor(
- [
- [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2],
- [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0],
- ]
- ),
- "input_nodes": tensor(
- [
- [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
- [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [0], [0], [0], [0]],
- ]
- ),
- "input_edges": tensor(
- [
- [
- [
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- ],
- [
- [
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [4]],
- [[4], [4], [4], [4], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[4], [4], [4], [0], [0]],
- [[4], [0], [0], [0], [0]],
- [[4], [4], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- [
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- [[0], [0], [0], [0], [0]],
- ],
- ],
- ]
- ),
- "labels": tensor([1, 0]),
- }
-
- output = model(**model_input)["logits"]
-
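-        # The pcqm4mv2 checkpoint predicts a single value per graph, so the logits are (batch_size, 1)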
- expected_shape = torch.Size((2, 1))
- self.assertEqual(output.shape, expected_shape)
-
-        expected_logits = torch.tensor(
-            [[7.6060], [7.4126]]
-        )
-
-        self.assertTrue(torch.allclose(output, expected_logits, atol=1e-4))
diff --git a/tests/models/jukebox/__init__.py b/tests/models/jukebox/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
deleted file mode 100644
index f064f442fcdc4d..00000000000000
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from unittest import skip
-
-from transformers import is_torch_available
-from transformers.testing_utils import (
- require_torch,
- require_torch_accelerator,
- require_torch_fp16,
- slow,
- torch_device,
-)
-from transformers.trainer_utils import set_seed
-
-
-if is_torch_available():
- import torch
-
- from transformers import JukeboxModel, JukeboxPrior, JukeboxTokenizer
-
-
-@require_torch
-class Jukebox1bModelTester(unittest.TestCase):
- all_model_classes = (JukeboxModel,) if is_torch_available() else ()
- model_id = "openai/jukebox-1b-lyrics"
- metas = {
- "artist": "Zac Brown Band",
- "genres": "Country",
- "lyrics": """I met a traveller from an antique land,
- Who said "Two vast and trunkless legs of stone
- Stand in the desert. . . . Near them, on the sand,
- Half sunk a shattered visage lies, whose frown,
- And wrinkled lip, and sneer of cold command,
- Tell that its sculptor well those passions read
- Which yet survive, stamped on these lifeless things,
- The hand that mocked them, and the heart that fed;
- And on the pedestal, these words appear:
- My name is Ozymandias, King of Kings;
- Look on my Works, ye Mighty, and despair!
- Nothing beside remains. Round the decay
- Of that colossal Wreck, boundless and bare
- The lone and level sands stretch far away
- """,
- }
- # fmt: off
- EXPECTED_OUTPUT_2 = [
- 1864, 1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534,
- 1000, 1445, 1105, 1130, 967, 515, 1434, 1620, 534, 1495, 283, 1445,
- 333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883,
- 1405, 1276, 1455, 1228
- ]
-
- EXPECTED_OUTPUT_2_PT_2 = [
- 1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653
- ]
-
- EXPECTED_OUTPUT_1 = [
- 1125, 1751, 697, 1776, 1141, 1476, 391, 697, 1125, 684, 867, 416,
- 844, 1372, 1274, 717, 1274, 844, 1299, 1419, 697, 1370, 317, 1125,
- 191, 1440, 1370, 1440, 1370, 282, 1621, 1370, 368, 349, 867, 1872,
- 1262, 869, 1728, 747
- ]
- EXPECTED_OUTPUT_1_PT_2 = [
- 416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416
- ]
-
- EXPECTED_OUTPUT_0 = [
- 1755, 842, 307, 1843, 1022, 1395, 234, 1554, 806, 739, 1022, 442,
- 616, 556, 268, 1499, 933, 457, 1440, 1837, 755, 985, 308, 902,
- 293, 1443, 1671, 1141, 1533, 555, 1562, 1061, 287, 417, 1022, 2008,
- 1186, 1015, 1777, 268
- ]
- EXPECTED_OUTPUT_0_PT_2 = [
- 854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842,
- 185, 417, 185, 842, 307, 842, 591, 842, 185, 842, 307, 842,
- 591, 842, 1353, 842, 185, 842, 591, 842, 591, 114, 591, 842,
- 185, 842, 591, 89
- ]
-
- EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
-
- EXPECTED_PRIMED_0 = [
- 390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
- 1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
- 1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
- 1907, 1788, 596, 1626
- ]
- EXPECTED_PRIMED_1 = [
- 1236, 1668, 1484, 1920, 1848, 1409, 139, 864, 1828, 1272, 1599, 824,
- 1672, 139, 555, 1484, 824, 1920, 555, 596, 1579, 1599, 1231, 1599,
- 1637, 1407, 212, 824, 1599, 116, 1433, 824, 258, 1599, 1433, 1895,
- 1063, 1433, 1433, 1599
- ]
- EXPECTED_PRIMED_2 = [
- 1684, 1873, 1119, 1189, 395, 611, 1901, 972, 890, 1337, 1392, 1927,
- 96, 972, 672, 780, 1119, 890, 158, 771, 1073, 1927, 353, 1331,
- 1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
- 197, 391, 302, 1930
- ]
- EXPECTED_VQVAE_ENCODE = [
- 390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
- 1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
- 1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
- 1907, 1788, 596, 1626
- ]
- EXPECTED_VQVAE_DECODE = [
- -0.0492, -0.0524, -0.0565, -0.0640, -0.0686, -0.0684, -0.0677, -0.0664,
- -0.0605, -0.0490, -0.0330, -0.0168, -0.0083, -0.0075, -0.0051, 0.0025,
- 0.0136, 0.0261, 0.0386, 0.0497, 0.0580, 0.0599, 0.0583, 0.0614,
- 0.0740, 0.0889, 0.1023, 0.1162, 0.1211, 0.1212, 0.1251, 0.1336,
- 0.1502, 0.1686, 0.1883, 0.2148, 0.2363, 0.2458, 0.2507, 0.2531
- ]
- EXPECTED_AUDIO_COND = [
- 0.0256, -0.0544, 0.1600, -0.0032, 0.1066, 0.0825, -0.0013, 0.3440,
- 0.0210, 0.0412, -0.1777, -0.0892, -0.0164, 0.0285, -0.0613, -0.0617,
- -0.0137, -0.0201, -0.0175, 0.0215, -0.0627, 0.0520, -0.0730, 0.0970,
- -0.0100, 0.0442, -0.0586, 0.0207, -0.0015, -0.0082
- ]
- EXPECTED_META_COND = [
- 0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
- 0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
- -0.0168, -0.0394, -0.1113, 0.0124, 0.0442, 0.0267, -0.0003, -0.1536,
- -0.0116, -0.1837, -0.0180, -0.1026, -0.0777, -0.0456
- ]
- EXPECTED_LYRIC_COND = [
- 76, 27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45, 45, 76, 38, 31, 33,
- 45, 76, 41, 32, 76, 45, 46, 41, 40, 31, 78, 76
- ]
- # fmt: on
-
- def prepare_inputs(self):
- tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
- tokens = tokenizer(**self.metas)["input_ids"]
- return tokens
-
- @slow
- def test_sampling(self):
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
- labels = self.prepare_inputs()
-
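-        # Sample the three prior levels in turn (top-level prior first); `zs` carries the codes between calls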
- set_seed(0)
- zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
- zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
- self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
- set_seed(0)
- zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
- self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
- set_seed(0)
- zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
- self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
- @slow
- def test_conditioning(self):
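-        # TF32 matmuls change the numerics enough to break the fp32 reference values, so disable them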
- torch.backends.cuda.matmul.allow_tf32 = False
- torch.backends.cudnn.allow_tf32 = False
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-
- labels = self.prepare_inputs()
- set_seed(0)
- zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
-
- top_prior = model.priors[0]
- start = 0
- music_token_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
- metadata = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
-
- self.assertIsNone(music_token_conds)
- self.assertListEqual(metadata.numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
-
- audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata)
- torch.testing.assert_close(
- audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol=1e-4, rtol=1e-4
- )
- torch.testing.assert_close(
- metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol=1e-4, rtol=1e-4
- )
- torch.testing.assert_close(
- lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
- )
-
- @slow
- def test_primed_sampling(self):
- torch.backends.cuda.matmul.allow_tf32 = False
- torch.backends.cudnn.allow_tf32 = False
-
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
- set_seed(0)
- waveform = torch.rand((1, 5120, 1))
- tokens = list(self.prepare_inputs())
-
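-        # Prime the sampling with VQ-VAE codes of a random waveform instead of starting from empty sequences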
- zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0], None, None]
- zs = model._sample(
- zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
- )
- torch.testing.assert_close(zs[0][0][:40], torch.tensor(self.EXPECTED_PRIMED_0))
-
- upper_2 = torch.cat((zs[0], torch.zeros(1, 2048 - zs[0].shape[-1])), dim=-1).long()
- zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0], None]
- zs = model._sample(
- zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens
- )
- torch.testing.assert_close(zs[1][0][:40], torch.tensor(self.EXPECTED_PRIMED_1))
-
- upper_1 = torch.cat((zs[1], torch.zeros(1, 2048 - zs[1].shape[-1])), dim=-1).long()
- zs = [upper_2, upper_1, model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
- zs = model._sample(
- zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
- )
- torch.testing.assert_close(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
-
- @slow
- def test_vqvae(self):
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
- set_seed(0)
- x = torch.rand((1, 5120, 1))
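-        # Round-trip a random waveform through the coarsest VQ-VAE level and check codes and reconstruction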
- with torch.no_grad():
- zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
- torch.testing.assert_close(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
-
- with torch.no_grad():
- x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
- torch.testing.assert_close(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol=1e-4)
-
-
-@require_torch
-class Jukebox5bModelTester(unittest.TestCase):
- all_model_classes = (JukeboxModel,) if is_torch_available() else ()
- model_id = "openai/jukebox-5b-lyrics"
- metas = {
- "artist": "Zac Brown Band",
- "genres": "Country",
- "lyrics": """I met a traveller from an antique land,
- Who said "Two vast and trunkless legs of stone
- Stand in the desert. . . . Near them, on the sand,
- Half sunk a shattered visage lies, whose frown,
- And wrinkled lip, and sneer of cold command,
- Tell that its sculptor well those passions read
- Which yet survive, stamped on these lifeless things,
- The hand that mocked them, and the heart that fed;
- And on the pedestal, these words appear:
- My name is Ozymandias, King of Kings;
- Look on my Works, ye Mighty, and despair!
- Nothing beside remains. Round the decay
- Of that colossal Wreck, boundless and bare
- The lone and level sands stretch far away
- """,
- }
-
- # fmt: off
- EXPECTED_OUTPUT_2 = [
- 1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
- ]
- EXPECTED_OUTPUT_2_PT_2 = [
- 1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
- ]
-
- EXPECTED_OUTPUT_1 = [
- 1125, 416, 1125, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
- ]
- EXPECTED_OUTPUT_1_PT_2 = [
- 416, 416, 1125, 1125, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416
- ]
-
- EXPECTED_OUTPUT_0 = [
- 1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
- 616, 616, 616, 616, 307, 290, 417, 1755, 234, 1755, 185, 290,
- 290, 290, 307, 616, 616, 616, 616, 616, 290, 234, 234, 1755,
- 234, 234, 1755, 234, 185, 185, 307, 616, 616, 616, 616, 290,
- 1755, 1755, 1755, 234, 234, 1755, 1572, 290, 307, 616, 34, 616
- ]
- EXPECTED_OUTPUT_0_PT_2 = [
- 854, 842, 1353, 114, 1353, 842, 185, 842, 185, 114, 591, 842, 185,
- 417, 185, 842, 307, 842, 591, 842, 185, 842, 185, 842, 591, 842,
- 1353, 842, 185, 842, 591, 842, 591, 114, 591, 842, 185, 842, 591,
- 89, 591, 842, 591, 842, 591, 417, 1372, 842, 1372, 842, 34, 842,
- 185, 89, 591, 842, 185, 842, 591, 632
- ]
-
- EXPECTED_GPU_OUTPUTS_2 = [
- 1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
- ]
- EXPECTED_GPU_OUTPUTS_2_PT_2 = [
- 1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
- 653, 653, 653, 653, 653, 653, 653, 1853, 1177, 1536, 1228,
- 710, 475, 1489, 1229, 1224, 231, 1224, 252, 1434, 653, 475,
- 1106, 1877, 1599, 1228, 1600, 1683, 1182, 1853, 475, 1864,
- 252, 1229, 1434, 2001
- ]
-
- EXPECTED_GPU_OUTPUTS_1 = [
- 1125, 1125, 416, 1125, 1125, 416, 1125, 1125, 416, 416, 1125, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
- 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
- ]
- EXPECTED_GPU_OUTPUTS_0 = [
- 491, 1755, 34, 1613, 1755, 417, 992, 1613, 222, 842, 1353, 1613,
- 844, 632, 185, 1613, 844, 632, 185, 1613, 185, 842, 677, 1613,
- 185, 114, 1353, 1613, 307, 89, 844, 1613, 307, 1332, 234, 1979,
- 307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
- 307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
- ]
- # fmt: on
-
- def prepare_inputs(self, model_id):
- tokenizer = JukeboxTokenizer.from_pretrained(model_id)
- tokens = tokenizer(**self.metas)["input_ids"]
- return tokens
-
- @slow
- def test_sampling(self):
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
- labels = self.prepare_inputs(self.model_id)
-
- set_seed(0)
- zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
- zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
- self.assertIn(zs[0][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_2, self.EXPECTED_OUTPUT_2_PT_2])
-
- set_seed(0)
- zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
- self.assertIn(zs[1][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_1, self.EXPECTED_OUTPUT_1_PT_2])
-
- set_seed(0)
- zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
- self.assertIn(zs[2][0].detach().cpu().tolist(), [self.EXPECTED_OUTPUT_0, self.EXPECTED_OUTPUT_0_PT_2])
-
- @slow
- @require_torch_accelerator
- @skip("Not enough GPU memory on CI runners")
- def test_slow_sampling(self):
- model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
- labels = [i.to(torch_device) for i in self.prepare_inputs(self.model_id)]
-
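-        # Move one prior at a time onto the accelerator (and back to CPU afterwards) to keep peak memory low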
- set_seed(0)
- model.priors[0].to(torch_device)
- zs = [torch.zeros(1, 0, dtype=torch.long).to(torch_device) for _ in range(3)]
- zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
- torch.testing.assert_close(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
- model.priors[0].cpu()
-
- set_seed(0)
- model.priors[1].to(torch_device)
- zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
- torch.testing.assert_close(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
- model.priors[1].cpu()
-
- set_seed(0)
- model.priors[2].to(torch_device)
- zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
- torch.testing.assert_close(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
- @slow
- @require_torch_accelerator
- @require_torch_fp16
- def test_fp16_slow_sampling(self):
- prior_id = "ArthurZ/jukebox_prior_0"
- model = JukeboxPrior.from_pretrained(prior_id, min_duration=0).eval().half().to(torch_device)
-
- labels = self.prepare_inputs(prior_id)[0].to(torch_device)
- metadata = model.get_metadata(labels, 0, 7680, 0)
- set_seed(0)
- outputs = model.sample(1, metadata=metadata, sample_tokens=60)
- self.assertIn(outputs[0].cpu().tolist(), [self.EXPECTED_GPU_OUTPUTS_2, self.EXPECTED_GPU_OUTPUTS_2_PT_2])
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
deleted file mode 100644
index c434cf6aa17f79..00000000000000
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import JukeboxTokenizer
-from transformers.testing_utils import require_torch
-
-
-class JukeboxTokenizationTest(unittest.TestCase):
- tokenizer_class = JukeboxTokenizer
- metas = {
- "artist": "Zac Brown Band",
- "genres": "Country",
- "lyrics": """I met a traveller from an antique land,
- Who said "Two vast and trunkless legs of stone
- Stand in the desert. . . . Near them, on the sand,
- Half sunk a shattered visage lies, whose frown,
- And wrinkled lip, and sneer of cold command,
- Tell that its sculptor well those passions read
- Which yet survive, stamped on these lifeless things,
- The hand that mocked them, and the heart that fed;
- And on the pedestal, these words appear:
- My name is Ozymandias, King of Kings;
- Look on my Works, ye Mighty, and despair!
- Nothing beside remains. Round the decay
- Of that colossal Wreck, boundless and bare
- The lone and level sands stretch far away
- """,
- }
-
- @require_torch
- def test_1b_lyrics_tokenizer(self):
- """
- how to run the same test with openAI
- ...
- """
- import torch
-
- tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
- tokens = tokenizer(**self.metas)["input_ids"]
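-        # One tensor of token ids per prior level; only the top level keeps the full lyric tokens,
-        # the other two only keep the metadata prefix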
- # fmt: off
- EXPECTED_OUTPUT = [
- torch.tensor([[
- 0, 0, 0, 7169, 507, 9, 76, 39, 31, 46, 76, 27,
- 76, 46, 44, 27, 48, 31, 38, 38, 31, 44, 76, 32,
- 44, 41, 39, 76, 27, 40, 76, 27, 40, 46, 35, 43,
- 47, 31, 76, 38, 27, 40, 30, 64, 78, 76, 76, 76,
- 76, 76, 76, 76, 76, 23, 34, 41, 76, 45, 27, 35,
- 30, 76, 71, 20, 49, 41, 76, 48, 27, 45, 46, 76,
- 27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45,
- 45, 76, 38, 31, 33, 45, 76, 41, 32, 76, 45, 46,
- 41, 40, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
- 19, 46, 27, 40, 30, 76, 35, 40, 76, 46, 34, 31,
- 76, 30, 31, 45, 31, 44, 46, 63, 76, 63, 76, 63,
- 76, 63, 76, 14, 31, 27, 44, 76, 46, 34, 31, 39,
- 64, 76, 41, 40, 76, 46, 34, 31, 76, 45, 27, 40,
- 30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76, 8,
- 27, 38, 32, 76, 45, 47, 40, 37, 76, 27, 76, 45,
- 34, 27, 46, 46, 31, 44, 31, 30, 76, 48, 35, 45,
- 27, 33, 31, 76, 38, 35, 31, 45, 64, 76, 49, 34,
- 41, 45, 31, 76, 32, 44, 41, 49, 40, 64, 78, 76,
- 76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76, 49,
- 44, 35, 40, 37, 38, 31, 30, 76, 38, 35, 42, 64,
- 76, 27, 40, 30, 76, 45, 40, 31, 31, 44, 76, 41,
- 32, 76, 29, 41, 38, 30, 76, 29, 41, 39, 39, 27,
- 40, 30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76,
- 20, 31, 38, 38, 76, 46, 34, 27, 46, 76, 35, 46,
- 45, 76, 45, 29, 47, 38, 42, 46, 41, 44, 76, 49,
- 31, 38, 38, 76, 46, 34, 41, 45, 31, 76, 42, 27,
- 45, 45, 35, 41, 40, 45, 76, 44, 31, 27, 30, 78,
- 76, 76, 76, 76, 76, 76, 76, 76, 23, 34, 35, 29,
- 34, 76, 51, 31, 46, 76, 45, 47, 44, 48, 35, 48,
- 31, 64, 76, 45, 46, 27, 39, 42, 31, 30, 76, 41,
- 40, 76, 46, 34, 31, 45, 31, 76, 38, 35, 32, 31,
- 38, 31, 45, 45, 76, 46, 34, 35, 40, 33, 45, 64,
- 78, 76, 76, 76, 76, 76, 76, 76, 76, 20, 34, 31,
- 76, 34, 27, 40, 30, 76, 46, 34, 27, 46, 76, 39,
- 41, 29, 37, 31, 30, 76, 46, 34, 31, 39, 64, 76,
- 27, 40, 30, 76, 46, 34, 31, 76, 34, 31, 27, 44,
- 46, 76, 46, 34, 27, 46, 76, 32, 31, 30, 66, 78,
- 76, 76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76,
- 41, 40, 76, 46, 34, 31, 76, 42, 31, 30, 31, 45,
- 46, 27, 38, 64, 76, 46, 34, 31, 45, 31, 76, 49,
- 41, 44, 30, 45, 76, 27, 42, 42, 31, 27, 44, 65,
- 78, 76, 76, 76, 76, 76, 76, 76, 76, 13, 51, 76,
- 40, 27, 39, 31, 76, 35, 45, 76, 15, 52, 51, 39,
- 27, 40, 30, 35, 27, 45, 64, 76, 11, 35, 40, 33,
- 76, 41, 32, 76, 11, 35, 40, 33, 45, 66, 78, 76,
- 76, 76, 76, 76, 76, 76, 76, 12, 41, 41, 37, 76,
- 41, 40, 76, 39, 51, 76, 23, 41, 44, 37, 45, 64,
- 76, 51, 31, 76, 13, 35, 33, 34, 46, 51, 64, 76,
- 27, 40, 30, 76, 30, 31, 45, 42, 27, 35, 44, 67,
- 78, 76, 76, 76, 76, 76, 76, 76, 76, 14, 41, 46,
- 34, 35, 40, 33, 76, 28, 31, 45, 35, 30, 31, 76,
- 44, 31, 39, 27, 35, 40, 45, 63, 76, 18, 41, 47,
- 40, 30, 76, 46, 34, 31, 76, 30, 31, 29, 27, 51,
- 78, 76, 76, 76, 76, 76, 76, 76, 76, 15, 32, 76,
- 46, 34, 27, 46, 76, 29, 41, 38, 41, 45, 45, 27,
- 38, 76, 23, 44, 31, 29, 37, 64, 76, 28, 41, 47,
- 40, 30, 38, 31, 45, 45, 76, 27, 40, 30, 76, 28,
- 27, 44, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
- 20, 34, 31, 76, 38, 41, 40, 31, 76, 27, 40, 30,
- 76, 38, 31, 48, 31, 38, 76, 45, 27, 40, 30, 45,
- 76, 45, 46, 44, 31, 46, 29, 34, 76, 32, 27, 44,
- 76, 27, 49, 27, 51, 78, 76, 76, 76, 76, 76, 76,
- 76, 76]]),
- torch.tensor([[0, 0, 0, 1069, 11]]),
- torch.tensor([[0, 0, 0, 1069, 11]]),
- ]
- # fmt: on
- self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
- self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
- self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
-
- @require_torch
- def test_5b_lyrics_tokenizer(self):
- """
- The outputs are similar that open AI but do not have the same format as this one is adapted to the HF integration.
- """
- import torch
-
- tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-5b-lyrics")
- tokens = tokenizer(**self.metas)["input_ids"]
- # fmt: off
- EXPECTED_OUTPUT = [
- torch.tensor([[
- 0, 0, 0, 1069, 11, -1, -1, -1, -1, 9, 77, 39,
- 31, 46, 77, 27, 77, 46, 44, 27, 48, 31, 38, 38,
- 31, 44, 77, 32, 44, 41, 39, 77, 27, 40, 77, 27,
- 40, 46, 35, 43, 47, 31, 77, 38, 27, 40, 30, 64,
- 79, 77, 77, 77, 77, 77, 77, 77, 77, 23, 34, 41,
- 77, 45, 27, 35, 30, 77, 72, 20, 49, 41, 77, 48,
- 27, 45, 46, 77, 27, 40, 30, 77, 46, 44, 47, 40,
- 37, 38, 31, 45, 45, 77, 38, 31, 33, 45, 77, 41,
- 32, 77, 45, 46, 41, 40, 31, 79, 77, 77, 77, 77,
- 77, 77, 77, 77, 19, 46, 27, 40, 30, 77, 35, 40,
- 77, 46, 34, 31, 77, 30, 31, 45, 31, 44, 46, 63,
- 77, 63, 77, 63, 77, 63, 77, 14, 31, 27, 44, 77,
- 46, 34, 31, 39, 64, 77, 41, 40, 77, 46, 34, 31,
- 77, 45, 27, 40, 30, 64, 79, 77, 77, 77, 77, 77,
- 77, 77, 77, 8, 27, 38, 32, 77, 45, 47, 40, 37,
- 77, 27, 77, 45, 34, 27, 46, 46, 31, 44, 31, 30,
- 77, 48, 35, 45, 27, 33, 31, 77, 38, 35, 31, 45,
- 64, 77, 49, 34, 41, 45, 31, 77, 32, 44, 41, 49,
- 40, 64, 79, 77, 77, 77, 77, 77, 77, 77, 77, 1,
- 40, 30, 77, 49, 44, 35, 40, 37, 38, 31, 30, 77,
- 38, 35, 42, 64, 77, 27, 40, 30, 77, 45, 40, 31,
- 31, 44, 77, 41, 32, 77, 29, 41, 38, 30, 77, 29,
- 41, 39, 39, 27, 40, 30, 64, 79, 77, 77, 77, 77,
- 77, 77, 77, 77, 20, 31, 38, 38, 77, 46, 34, 27,
- 46, 77, 35, 46, 45, 77, 45, 29, 47, 38, 42, 46,
- 41, 44, 77, 49, 31, 38, 38, 77, 46, 34, 41, 45,
- 31, 77, 42, 27, 45, 45, 35, 41, 40, 45, 77, 44,
- 31, 27, 30, 79, 77, 77, 77, 77, 77, 77, 77, 77,
- 23, 34, 35, 29, 34, 77, 51, 31, 46, 77, 45, 47,
- 44, 48, 35, 48, 31, 64, 77, 45, 46, 27, 39, 42,
- 31, 30, 77, 41, 40, 77, 46, 34, 31, 45, 31, 77,
- 38, 35, 32, 31, 38, 31, 45, 45, 77, 46, 34, 35,
- 40, 33, 45, 64, 79, 77, 77, 77, 77, 77, 77, 77,
- 77, 20, 34, 31, 77, 34, 27, 40, 30, 77, 46, 34,
- 27, 46, 77, 39, 41, 29, 37, 31, 30, 77, 46, 34,
- 31, 39, 64, 77, 27, 40, 30, 77, 46, 34, 31, 77,
- 34, 31, 27, 44, 46, 77, 46, 34, 27, 46, 77, 32,
- 31, 30, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77,
- 1, 40, 30, 77, 41, 40, 77, 46, 34, 31, 77, 42,
- 31, 30, 31, 45, 46, 27, 38, 64, 77, 46, 34, 31,
- 45, 31, 77, 49, 41, 44, 30, 45, 77, 27, 42, 42,
- 31, 27, 44, 65, 79, 77, 77, 77, 77, 77, 77, 77,
- 77, 13, 51, 77, 40, 27, 39, 31, 77, 35, 45, 77,
- 15, 52, 51, 39, 27, 40, 30, 35, 27, 45, 64, 77,
- 11, 35, 40, 33, 77, 41, 32, 77, 11, 35, 40, 33,
- 45, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77, 12,
- 41, 41, 37, 77, 41, 40, 77, 39, 51, 77, 23, 41,
- 44, 37, 45, 64, 77, 51, 31, 77, 13, 35, 33, 34,
- 46, 51, 64, 77, 27, 40, 30, 77, 30, 31, 45, 42,
- 27, 35, 44, 67, 79, 77, 77, 77, 77, 77, 77, 77,
- 77, 14, 41, 46, 34, 35, 40, 33, 77, 28, 31, 45,
- 35, 30, 31, 77, 44, 31, 39, 27, 35, 40, 45, 63,
- 77, 18, 41, 47, 40, 30, 77, 46, 34, 31, 77, 30,
- 31, 29, 27, 51, 79, 77, 77, 77, 77, 77, 77, 77,
- 77, 15, 32, 77, 46, 34, 27, 46, 77, 29, 41, 38,
- 41, 45, 45, 27, 38, 77, 23, 44, 31, 29, 37, 64,
- 77, 28, 41, 47, 40, 30, 38, 31, 45, 45, 77, 27,
- 40, 30, 77, 28, 27, 44, 31, 79, 77, 77, 77, 77,
- 77, 77, 77, 77, 20, 34, 31, 77, 38, 41, 40, 31,
- 77, 27, 40, 30, 77, 38, 31, 48, 31, 38, 77, 45,
- 27, 40, 30, 45, 77, 45, 46, 44, 31, 46, 29, 34,
- 77, 32, 27, 44, 77, 27, 49, 27, 51, 79, 77, 77,
- 77, 77, 77, 77, 77, 77]]),
- torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
- torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
- ]
- # fmt: on
- self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
- self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
- self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
diff --git a/tests/models/mega/__init__.py b/tests/models/mega/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/mega/test_modeling_mega.py b/tests/models/mega/test_modeling_mega.py
deleted file mode 100644
index 872f0a38af8e8f..00000000000000
--- a/tests/models/mega/test_modeling_mega.py
+++ /dev/null
@@ -1,744 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import MegaConfig, is_torch_available
-from transformers.testing_utils import (
- TestCasePlus,
- is_flaky,
- require_torch,
- require_torch_fp16,
- slow,
- torch_device,
-)
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- MegaForCausalLM,
- MegaForMaskedLM,
- MegaForMultipleChoice,
- MegaForQuestionAnswering,
- MegaForSequenceClassification,
- MegaForTokenClassification,
- MegaModel,
- )
-
-
-class MegaModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=2,
- intermediate_size=37,
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_positions=1024,
-        bidirectional=False, # needed for decoding; the common generation tests can't be modified, so the bidirectional case is tested separately by overriding
- ema_projection_size=16,
- shared_representation_size=64,
- use_chunking=False,
- chunk_size=32,
- attention_activation="softmax",
- use_normalized_ffn=True,
- nffn_hidden_size=24,
- add_token_type_embeddings=True,
- type_vocab_size=2,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.add_token_type_embeddings = add_token_type_embeddings
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.intermediate_size = intermediate_size
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_positions = max_positions
- self.bidirectional = bidirectional
- self.ema_projection_size = ema_projection_size
- self.shared_representation_size = shared_representation_size
- self.use_chunking = use_chunking
- self.chunk_size = chunk_size
- self.attention_activation = attention_activation
- self.use_normalized_ffn = use_normalized_ffn
- self.nffn_hidden_size = nffn_hidden_size
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
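-        # MEGA's gated attention is single-headed, so the tester pins num_attention_heads to 1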
- self.num_attention_heads = 1
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- token_type_ids = None
- if self.add_token_type_embeddings:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def get_config(self):
- return MegaConfig(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- intermediate_size=self.intermediate_size,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- type_vocab_size=self.type_vocab_size,
- initializer_range=self.initializer_range,
- # added args
- add_token_type_embeddings=self.add_token_type_embeddings,
- max_positions=self.max_positions,
- bidirectional=self.bidirectional,
- ema_projection_size=self.ema_projection_size,
- shared_representation_size=self.shared_representation_size,
- use_chunking=self.use_chunking,
- chunk_size=self.chunk_size,
- attention_activation=self.attention_activation,
- use_normalized_ffn=self.use_normalized_ffn,
- nffn_hidden_size=self.nffn_hidden_size,
- )
-
- def get_pipeline_config(self):
- config = self.get_config()
- config.vocab_size = 300
- return config
-
- def prepare_config_and_inputs_for_decoder(self):
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = self.prepare_config_and_inputs()
-
- config.is_decoder = True
- config.bidirectional = False
- encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
- encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = MegaModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- result = model(input_ids, token_type_ids=token_type_ids)
- result = model(input_ids)
-
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
- self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
- def create_and_check_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- )
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- )
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
- self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
- def create_and_check_for_causal_lm(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- model = MegaForCausalLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_decoder_model_past_large_inputs(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.is_decoder = True
- config.bidirectional = False
- config.add_cross_attention = True
- model = MegaForCausalLM(config=config).to(torch_device).eval()
-
- # make sure the input ids don't contain the pad token (pad positions are replaced with token id 0)
- mask = input_ids.ne(config.pad_token_id).long()
- input_ids = input_ids * mask
-
- # first forward pass
- outputs = model(
- input_ids,
- attention_mask=input_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- use_cache=True,
- )
- past_key_values = outputs.past_key_values
-
- # create a hypothetical next token and extend it to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
- # make sure the next tokens don't contain the pad token (pad positions are replaced with token id 0)
- mask = next_tokens.ne(config.pad_token_id).long()
- next_tokens = next_tokens * mask
- next_mask = ids_tensor((self.batch_size, 1), vocab_size=2)
-
- # append to next input_ids and attention mask
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
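- # run the full extended sequence without the cache and only the new token with the cached
- # past_key_values; the hidden states at the new position should match in both cases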
- output_from_no_past = model(
- next_input_ids,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- output_hidden_states=True,
- )["hidden_states"][0]
- output_from_past = model(
- next_tokens,
- attention_mask=next_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- output_hidden_states=True,
- )["hidden_states"][0]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, -1:, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
- self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
- # test that outputs are equal for slice
- self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
- def create_and_check_decoder_model_with_chunking(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.use_chunking = True
- config.output_attentions = True
- config.attention_activation = "laplace"
- config.chunk_size = input_ids.size(1) * 2
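- # chunked attention requires the sequence length to be a multiple of chunk_size, so the inputs are repeated below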
-
- model = MegaForCausalLM(config).to(torch_device).eval()
-
- input_ids = input_ids.repeat(1, 8)
- # multiply the sequence length by 8 since we repeat the same ids 8 times in input_ids
- input_mask = random_attention_mask([self.batch_size, self.seq_length * 8])
-
- result = model(input_ids, attention_mask=input_mask)
-
- # check that the attention outputs' sequence length matches the configured chunk_size
- self.parent.assertEqual(result["attentions"][0].shape[-1], config.chunk_size)
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = MegaForMaskedLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = MegaForTokenClassification(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = MegaForMultipleChoice(config=config)
- model.to(torch_device)
- model.eval()
- multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- result = model(
- multiple_choice_inputs_ids,
- attention_mask=multiple_choice_input_mask,
- token_type_ids=multiple_choice_token_type_ids,
- labels=choice_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = MegaForQuestionAnswering(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- # extra checks for Mega-specific model functionality
- def create_and_check_bidirectionality(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.bidirectional = True
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
- # no mask
- result = model(input_ids)
- # with mask & token types
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def check_chunking_shorter_sequence(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.use_chunking = True
- config.chunk_size = input_ids.size(1) + 25
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
-
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def check_chunking_longer_sequence(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.use_chunking = True
-
- # we want the chunk size to be < sequence length, and the sequence length to be a multiple of chunk size
- config.chunk_size = input_ids.size(1) * 2
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
-
- result = model(
- input_ids.repeat(1, 8),
- )
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length * 8, self.hidden_size))
-
- def check_laplace_self_attention(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.attention_activation = "laplace"
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
-
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def check_relu2_self_attention(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.attention_activation = "relu2"
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
-
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def check_sequence_length_beyond_max_positions(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.max_positions = self.seq_length - 2
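- # the model should still run on sequences longer than config.max_positions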
- model = MegaModel(config)
- model.to(torch_device)
- model.eval()
-
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-class MegaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- MegaForCausalLM,
- MegaForMaskedLM,
- MegaModel,
- MegaForSequenceClassification,
- MegaForTokenClassification,
- MegaForMultipleChoice,
- MegaForQuestionAnswering,
- )
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = (MegaForCausalLM,) if is_torch_available() else ()
- pipeline_model_mapping = (
- {
- "feature-extraction": MegaModel,
- "fill-mask": MegaForMaskedLM,
- "question-answering": MegaForQuestionAnswering,
- "text-classification": MegaForSequenceClassification,
- "text-generation": MegaForCausalLM,
- "token-classification": MegaForTokenClassification,
- "zero-shot": MegaForSequenceClassification,
- }
- if is_torch_available()
- else {}
- )
-
- fx_compatible = False
- test_head_masking = False
- test_pruning = False
-
- def setUp(self):
- self.model_tester = MegaModelTester(self)
- self.config_tester = ConfigTester(self, config_class=MegaConfig, hidden_size=37)
-
- # TODO: @ydshieh
- @is_flaky(description="Sometimes gives `AssertionError` on expected outputs")
- def test_pipeline_fill_mask(self):
- super().test_pipeline_fill_mask()
-
- # TODO: @ydshieh
- @is_flaky(
- description="Sometimes gives `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`"
- )
- def test_pipeline_text_generation(self):
- super().test_pipeline_text_generation()
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_as_decoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
- def test_model_as_decoder_with_default_input_mask(self):
- # This regression test was failing with PyTorch < 1.3
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
- input_mask = None
-
- self.model_tester.create_and_check_model_as_decoder(
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def test_for_causal_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
-
- def test_decoder_model_past_with_large_inputs(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
- def test_decoder_model_with_chunking(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_decoder_model_with_chunking(*config_and_inputs)
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_bidirectionality(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_bidirectionality(*config_and_inputs)
-
- def test_for_chunking_shorter_sequence(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.check_chunking_shorter_sequence(*config_and_inputs)
-
- def test_for_chunking_longer_sequence(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.check_chunking_longer_sequence(*config_and_inputs)
-
- def test_for_laplace_attention(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.check_laplace_self_attention(*config_and_inputs)
-
- def test_for_relu2_attention(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.check_relu2_self_attention(*config_and_inputs)
-
- def test_for_sequence_length_beyond_max_positions(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.check_sequence_length_beyond_max_positions(*config_and_inputs)
-
- @require_torch_fp16
- def test_generate_fp16(self):
- config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs_for_decoder()
- # attention_mask = torch.LongTensor(input_ids.ne(1)).to(torch_device)
- model = MegaForCausalLM(config).eval().to(torch_device)
- model.half()
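- # generation should run end-to-end in half precision without errors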
- model.generate(input_ids, attention_mask=attention_mask)
- model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
- def test_sequence_classification_model(self):
- config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
- config.num_labels = self.model_tester.num_labels
- sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
- model = MegaForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
- self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
- def test_sequence_classification_model_for_multi_label(self):
- config, input_ids, _, attention_mask, *_ = self.model_tester.prepare_config_and_inputs()
- config.num_labels = self.model_tester.num_labels
- config.problem_type = "multi_label_classification"
- sequence_labels = ids_tensor(
- [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
- ).to(torch.float)
- model = MegaForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
- self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "mnaylor/mega-base-wikitext"
- model = MegaModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
- def test_cpu_offload(self):
- super().test_cpu_offload()
-
- @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
- def test_disk_offload(self):
- super().test_disk_offload()
-
- @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
- def test_model_parallelism(self):
- super().test_model_parallelism()
-
- @unittest.skip(
- reason=(
- "Calling `self.attention_function` in `MegaMovingAverageGatedAttention.forward` changes the submodules on "
- "device 1 to device 0 (also changes `requires_grad`). No idea how this could happen for now."
- )
- )
- def test_multi_gpu_data_parallel_forward(self):
- super().test_multi_gpu_data_parallel_forward()
-
- @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
- def test_torchscript_simple(self):
- super().test_torchscript_simple()
-
- @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
- def test_torchscript_output_hidden_state(self):
- super().test_torchscript_output_hidden_state()
-
- @unittest.skip(reason="Tracing of the dynamically computed `MegaMultiDimensionDampedEma._kernel` doesn't work.")
- def test_torchscript_output_attentions(self):
- super().test_torchscript_output_attentions()
-
-
-@require_torch
-class MegaModelIntegrationTest(TestCasePlus):
- @slow
- def test_inference_masked_lm(self):
- model = MegaForMaskedLM.from_pretrained("mnaylor/mega-base-wikitext")
-
- input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- with torch.no_grad():
- output = model(input_ids)[0]
- expected_shape = torch.Size((1, 11, 50265))
- self.assertEqual(output.shape, expected_shape)
- # compare the actual values for a slice.
- expected_slice = torch.tensor(
- [[[67.8389, 10.1470, -32.7148], [-11.1655, 29.1152, 23.1304], [-3.8015, 66.0397, 29.6733]]]
- )
-
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_no_head(self):
- model = MegaModel.from_pretrained("mnaylor/mega-base-wikitext")
-
- input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
- with torch.no_grad():
- output = model(input_ids)[0]
- expected_shape = torch.Size((1, 11, 128))
- self.assertEqual(output.shape, expected_shape)
- # compare the actual values for a slice taken from output[:, :3, :3]
- expected_slice = torch.tensor(
- [[[1.1767, -0.6349, 2.8494], [-0.5109, -0.7745, 1.9495], [-0.3287, -0.2111, 3.3367]]]
- )
-
- self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/nat/__init__.py b/tests/models/nat/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py
deleted file mode 100644
index c04472620b4f8f..00000000000000
--- a/tests/models/nat/test_modeling_nat.py
+++ /dev/null
@@ -1,382 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Nat model."""
-
-import collections
-import unittest
-
-from transformers import NatConfig
-from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_backbone_common import BackboneTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
- from torch import nn
-
- from transformers import NatBackbone, NatForImageClassification, NatModel
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import AutoImageProcessor
-
-
-class NatModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- image_size=64,
- patch_size=4,
- num_channels=3,
- embed_dim=16,
- depths=[1, 2, 1],
- num_heads=[2, 4, 8],
- kernel_size=3,
- mlp_ratio=2.0,
- qkv_bias=True,
- hidden_dropout_prob=0.0,
- attention_probs_dropout_prob=0.0,
- drop_path_rate=0.1,
- hidden_act="gelu",
- patch_norm=True,
- initializer_range=0.02,
- layer_norm_eps=1e-5,
- is_training=True,
- scope=None,
- use_labels=True,
- num_labels=10,
- out_features=["stage1", "stage2"],
- out_indices=[1, 2],
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.image_size = image_size
- self.patch_size = patch_size
- self.num_channels = num_channels
- self.embed_dim = embed_dim
- self.depths = depths
- self.num_heads = num_heads
- self.kernel_size = kernel_size
- self.mlp_ratio = mlp_ratio
- self.qkv_bias = qkv_bias
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.drop_path_rate = drop_path_rate
- self.hidden_act = hidden_act
- self.patch_norm = patch_norm
- self.layer_norm_eps = layer_norm_eps
- self.initializer_range = initializer_range
- self.is_training = is_training
- self.scope = scope
- self.use_labels = use_labels
- self.num_labels = num_labels
- self.out_features = out_features
- self.out_indices = out_indices
-
- def prepare_config_and_inputs(self):
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
- labels = None
- if self.use_labels:
- labels = ids_tensor([self.batch_size], self.num_labels)
-
- config = self.get_config()
-
- return config, pixel_values, labels
-
- def get_config(self):
- return NatConfig(
- num_labels=self.num_labels,
- image_size=self.image_size,
- patch_size=self.patch_size,
- num_channels=self.num_channels,
- embed_dim=self.embed_dim,
- depths=self.depths,
- num_heads=self.num_heads,
- kernel_size=self.kernel_size,
- mlp_ratio=self.mlp_ratio,
- qkv_bias=self.qkv_bias,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- drop_path_rate=self.drop_path_rate,
- hidden_act=self.hidden_act,
- patch_norm=self.patch_norm,
- layer_norm_eps=self.layer_norm_eps,
- initializer_range=self.initializer_range,
- out_features=self.out_features,
- out_indices=self.out_indices,
- )
-
- def create_and_check_model(self, config, pixel_values, labels):
- model = NatModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values)
-
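- # each stage after the first halves the spatial resolution and doubles the embedding dim,
- # hence the 2 ** (len(depths) - 1) factors below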
- expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
- expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
-
- self.parent.assertEqual(
- result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
- )
-
- def create_and_check_for_image_classification(self, config, pixel_values, labels):
- model = NatForImageClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values, labels=labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- # test greyscale images
- config.num_channels = 1
- model = NatForImageClassification(config)
- model.to(torch_device)
- model.eval()
-
- pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
- result = model(pixel_values)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_backbone(self, config, pixel_values, labels):
- model = NatBackbone(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values)
-
- # verify hidden states
- self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
- self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16])
-
- # verify channels
- self.parent.assertEqual(len(model.channels), len(config.out_features))
-
- # verify backbone works with out_features=None
- config.out_features = None
- model = NatBackbone(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values)
-
- # verify feature maps
- self.parent.assertEqual(len(result.feature_maps), 1)
- self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4])
-
- # verify channels
- self.parent.assertEqual(len(model.channels), 1)
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- config, pixel_values, labels = config_and_inputs
- inputs_dict = {"pixel_values": pixel_values}
- return config, inputs_dict
-
-
-@require_natten
-@require_torch
-class NatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- NatModel,
- NatForImageClassification,
- NatBackbone,
- )
- if is_torch_available()
- else ()
- )
- pipeline_model_mapping = (
- {"image-feature-extraction": NatModel, "image-classification": NatForImageClassification}
- if is_torch_available()
- else {}
- )
- fx_compatible = False
-
- test_torchscript = False
- test_pruning = False
- test_resize_embeddings = False
- test_head_masking = False
-
- def setUp(self):
- self.model_tester = NatModelTester(self)
- self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37)
-
- def test_config(self):
- self.create_and_test_config_common_properties()
- self.config_tester.create_and_test_config_to_json_string()
- self.config_tester.create_and_test_config_to_json_file()
- self.config_tester.create_and_test_config_from_and_save_pretrained()
- self.config_tester.create_and_test_config_with_num_labels()
- self.config_tester.check_config_can_be_init_without_params()
- self.config_tester.check_config_arguments_init()
-
- def create_and_test_config_common_properties(self):
- return
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_image_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
- def test_backbone(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_backbone(*config_and_inputs)
-
- @unittest.skip(reason="Nat does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- @unittest.skip(reason="Nat does not use feedforward chunking")
- def test_feed_forward_chunking(self):
- pass
-
- def test_model_common_attributes(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
- x = model.get_output_embeddings()
- self.assertTrue(x is None or isinstance(x, nn.Linear))
-
- def test_attention_outputs(self):
- self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
-
- def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
- model = model_class(config)
- model.to(torch_device)
- model.eval()
-
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
- hidden_states = outputs.hidden_states
-
- expected_num_layers = getattr(
- self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
- )
- self.assertEqual(len(hidden_states), expected_num_layers)
-
- # Nat has a different seq_length
- patch_size = (
- config.patch_size
- if isinstance(config.patch_size, collections.abc.Iterable)
- else (config.patch_size, config.patch_size)
- )
-
- height = image_size[0] // patch_size[0]
- width = image_size[1] // patch_size[1]
-
- self.assertListEqual(
- list(hidden_states[0].shape[-3:]),
- [height, width, self.model_tester.embed_dim],
- )
-
- if model_class.__name__ != "NatBackbone":
- reshaped_hidden_states = outputs.reshaped_hidden_states
- self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
-
- batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
- reshaped_hidden_states = (
- reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
- )
- self.assertListEqual(
- list(reshaped_hidden_states.shape[-3:]),
- [height, width, self.model_tester.embed_dim],
- )
-
- def test_hidden_states_output(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- image_size = (
- self.model_tester.image_size
- if isinstance(self.model_tester.image_size, collections.abc.Iterable)
- else (self.model_tester.image_size, self.model_tester.image_size)
- )
-
- for model_class in self.all_model_classes:
- inputs_dict["output_hidden_states"] = True
- self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
- # check that output_hidden_states also work using config
- del inputs_dict["output_hidden_states"]
- config.output_hidden_states = True
-
- self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "shi-labs/nat-mini-in1k-224"
- model = NatModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- def test_initialization(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- configs_no_init = _config_zero_init(config)
- for model_class in self.all_model_classes:
- model = model_class(config=configs_no_init)
- for name, param in model.named_parameters():
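- # with the near-zero initializer, non-embedding weights should have a mean of 0.0 (or 1.0 for norm weights)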
- if "embeddings" not in name and param.requires_grad:
- self.assertIn(
- ((param.data.mean() * 1e9).round() / 1e9).item(),
- [0.0, 1.0],
- msg=f"Parameter {name} of model {model_class} seems not properly initialized",
- )
-
-
-@require_natten
-@require_vision
-@require_torch
-class NatModelIntegrationTest(unittest.TestCase):
- @cached_property
- def default_image_processor(self):
- return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
-
- @slow
- def test_inference_image_classification_head(self):
- model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
- image_processor = self.default_image_processor
-
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_shape = torch.Size((1, 1000))
- self.assertEqual(outputs.logits.shape, expected_shape)
- expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device)
- self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
-
-@require_torch
-@require_natten
-class NatBackboneTest(unittest.TestCase, BackboneTesterMixin):
- all_model_classes = (NatBackbone,) if is_torch_available() else ()
- config_class = NatConfig
-
- def setUp(self):
- self.model_tester = NatModelTester(self)
diff --git a/tests/models/nezha/__init__.py b/tests/models/nezha/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py
deleted file mode 100644
index 311866758be65e..00000000000000
--- a/tests/models/nezha/test_modeling_nezha.py
+++ /dev/null
@@ -1,489 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import tempfile
-import unittest
-
-from transformers import NezhaConfig, is_torch_available
-from transformers.models.auto import get_values
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- MODEL_FOR_PRETRAINING_MAPPING,
- NezhaForMaskedLM,
- NezhaForMultipleChoice,
- NezhaForNextSentencePrediction,
- NezhaForPreTraining,
- NezhaForQuestionAnswering,
- NezhaForSequenceClassification,
- NezhaForTokenClassification,
- NezhaModel,
- )
-
-
-class NezhaModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=128,
- max_relative_position=32,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def get_config(self):
- """
- Returns a tiny configuration by default.
- """
- return NezhaConfig(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- is_decoder=False,
- initializer_range=self.initializer_range,
- )
-
- def prepare_config_and_inputs_for_decoder(self):
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = self.prepare_config_and_inputs()
-
- config.is_decoder = True
- encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
- encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = NezhaModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- result = model(input_ids, token_type_ids=token_type_ids)
- result = model(input_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
- self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
- def create_and_check_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
- model = NezhaModel(config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- )
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- )
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
- self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = NezhaForMaskedLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_next_sequence_prediction(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = NezhaForNextSentencePrediction(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- labels=sequence_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
- def create_and_check_for_pretraining(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = NezhaForPreTraining(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- labels=token_labels,
- next_sentence_label=sequence_labels,
- )
- self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
- self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = NezhaForQuestionAnswering(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = NezhaForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = NezhaForTokenClassification(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = NezhaForMultipleChoice(config=config)
- model.to(torch_device)
- model.eval()
- multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- result = model(
- multiple_choice_inputs_ids,
- attention_mask=multiple_choice_input_mask,
- token_type_ids=multiple_choice_token_type_ids,
- labels=choice_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-class NezhaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- NezhaModel,
- NezhaForMaskedLM,
- NezhaForMultipleChoice,
- NezhaForNextSentencePrediction,
- NezhaForPreTraining,
- NezhaForQuestionAnswering,
- NezhaForSequenceClassification,
- NezhaForTokenClassification,
- )
- if is_torch_available()
- else ()
- )
- pipeline_model_mapping = (
- {
- "feature-extraction": NezhaModel,
- "fill-mask": NezhaForMaskedLM,
- "question-answering": NezhaForQuestionAnswering,
- "text-classification": NezhaForSequenceClassification,
- "token-classification": NezhaForTokenClassification,
- "zero-shot": NezhaForSequenceClassification,
- }
- if is_torch_available()
- else {}
- )
- fx_compatible = True
-
- # special case for ForPreTraining model
- def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
- inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
- if return_labels:
- if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
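- # pretraining models need both MLM labels and a next-sentence label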
- inputs_dict["labels"] = torch.zeros(
- (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
- )
- inputs_dict["next_sentence_label"] = torch.zeros(
- self.model_tester.batch_size, dtype=torch.long, device=torch_device
- )
- return inputs_dict
-
- def setUp(self):
- self.model_tester = NezhaModelTester(self)
- self.config_tester = ConfigTester(self, config_class=NezhaConfig, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_as_decoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
- def test_model_as_decoder_with_default_input_mask(self):
- # This regression test was failing with PyTorch < 1.3
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
- input_mask = None
-
- self.model_tester.create_and_check_model_as_decoder(
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_next_sequence_prediction(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
- def test_for_pretraining(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "sijunhe/nezha-cn-base"
- model = NezhaModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- @slow
- @require_torch_gpu
- def test_torchscript_device_change(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- for model_class in self.all_model_classes:
- # NezhaForMultipleChoice behaves incorrectly in JIT environments, so skip it.
- if model_class == NezhaForMultipleChoice:
- continue
-
- config.torchscript = True
- model = model_class(config=config)
-
- inputs_dict = self._prepare_for_class(inputs_dict, model_class)
- traced_model = torch.jit.trace(
- model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu"))
- )
-
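- # save the CPU-traced model and reload it on the target device to check device portability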
- with tempfile.TemporaryDirectory() as tmp:
- torch.jit.save(traced_model, os.path.join(tmp, "bert.pt"))
- loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device)
- loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))
-
-
-@require_torch
-class NezhaModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_nezha_model(self):
- model = NezhaModel.from_pretrained("sijunhe/nezha-cn-base")
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1]])
- with torch.no_grad():
- output = model(input_ids, attention_mask=attention_mask)[0]
- expected_shape = torch.Size((1, 6, 768))
- self.assertEqual(output.shape, expected_shape)
- expected_slice = torch.tensor([[[0.0685, 0.2441, 0.1102], [0.0600, 0.1906, 0.1349], [0.0221, 0.0819, 0.0586]]])
-
- self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_nezha_masked_lm(self):
- model = NezhaForMaskedLM.from_pretrained("sijunhe/nezha-cn-base")
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1]])
- with torch.no_grad():
- output = model(input_ids, attention_mask=attention_mask)[0]
- expected_shape = torch.Size((1, 6, 21128))
- self.assertEqual(output.shape, expected_shape)
- expected_slice = torch.tensor(
- [[-2.7939, -1.7902, -2.2189], [-2.8585, -1.8908, -2.3723], [-2.6499, -1.7750, -2.2558]]
- )
-
- self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
diff --git a/tests/models/qdqbert/__init__.py b/tests/models/qdqbert/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py
deleted file mode 100644
index 96e63834ec435a..00000000000000
--- a/tests/models/qdqbert/test_modeling_qdqbert.py
+++ /dev/null
@@ -1,573 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-# Copyright 2021 NVIDIA Corporation. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch QDQBERT model."""
-
-import unittest
-
-from transformers import QDQBertConfig, is_torch_available
-from transformers.testing_utils import require_pytorch_quantization, require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- QDQBertForMaskedLM,
- QDQBertForMultipleChoice,
- QDQBertForNextSentencePrediction,
- QDQBertForQuestionAnswering,
- QDQBertForSequenceClassification,
- QDQBertForTokenClassification,
- QDQBertLMHeadModel,
- QDQBertModel,
- )
-
-
-class QDQBertModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- num_labels=3,
- num_choices=4,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- # Set default quantizers before creating the model.
- import pytorch_quantization.nn as quant_nn
- from pytorch_quantization.tensor_quant import QuantDescriptor
-
- # The default input quantizer is set to use the Max calibration method
- input_desc = QuantDescriptor(num_bits=8, calib_method="max")
- # The default weight quantizer is set to per-channel quantization
- weight_desc = QuantDescriptor(num_bits=8, axis=(0,))
- quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
- quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
- # Since the QDQBert model is tested in a single run without calibration, use fake quantization so the quantized tensors remain float tensors.
- quant_nn.TensorQuantizer.use_fb_fake_quant = True
-
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
- input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
- token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
- def get_config(self):
- return QDQBertConfig(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- is_decoder=False,
- initializer_range=self.initializer_range,
- )
-
- def prepare_config_and_inputs_for_decoder(self):
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = self.prepare_config_and_inputs()
-
- config.is_decoder = True
- encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
- encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def create_and_check_model(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = QDQBertModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- result = model(input_ids, token_type_ids=token_type_ids)
- result = model(input_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_model_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
- model = QDQBertModel(config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- )
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- encoder_hidden_states=encoder_hidden_states,
- )
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_causal_lm(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- model = QDQBertLMHeadModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_for_masked_lm(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = QDQBertForMaskedLM(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_model_for_causal_lm_as_decoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.add_cross_attention = True
- model = QDQBertLMHeadModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- labels=token_labels,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- )
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- labels=token_labels,
- encoder_hidden_states=encoder_hidden_states,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
- def create_and_check_decoder_model_past_large_inputs(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ):
- config.is_decoder = True
- config.add_cross_attention = True
- model = QDQBertLMHeadModel(config=config)
- model.to(torch_device)
- model.eval()
-
- # first forward pass
- outputs = model(
- input_ids,
- attention_mask=input_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- use_cache=True,
- )
- past_key_values = outputs.past_key_values
-
- # create hypothetical multiple next tokens and extend to next_input_ids
- next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
- next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
- # append to next input_ids and next_attention_mask
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
- output_from_no_past = model(
- next_input_ids,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- output_hidden_states=True,
- )["hidden_states"][0]
- output_from_past = model(
- next_tokens,
- attention_mask=next_attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- past_key_values=past_key_values,
- output_hidden_states=True,
- )["hidden_states"][0]
-
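- # the no-past forward sees the full extended sequence, so its last 3 positions should match the 3 positions computed with the cache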
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
- self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
- # test that outputs are equal for slice
- self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
- def create_and_check_for_next_sequence_prediction(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = QDQBertForNextSentencePrediction(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- labels=sequence_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
-
- def create_and_check_for_question_answering(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- model = QDQBertForQuestionAnswering(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- start_positions=sequence_labels,
- end_positions=sequence_labels,
- )
- self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
- self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
-
- def create_and_check_for_sequence_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = QDQBertForSequenceClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_token_classification(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_labels = self.num_labels
- model = QDQBertForTokenClassification(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
- def create_and_check_for_multiple_choice(
- self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- ):
- config.num_choices = self.num_choices
- model = QDQBertForMultipleChoice(config=config)
- model.to(torch_device)
- model.eval()
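- # expand each example to (num_choices, seq_length), as expected by the multiple-choice head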
- multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- result = model(
- multiple_choice_inputs_ids,
- attention_mask=multiple_choice_input_mask,
- token_type_ids=multiple_choice_token_type_ids,
- labels=choice_labels,
- )
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-@require_pytorch_quantization
-class QDQBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- QDQBertModel,
- QDQBertForMaskedLM,
- QDQBertForMultipleChoice,
- QDQBertForNextSentencePrediction,
- QDQBertForQuestionAnswering,
- QDQBertForSequenceClassification,
- QDQBertForTokenClassification,
- QDQBertLMHeadModel,
- )
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = (QDQBertLMHeadModel,) if is_torch_available() else ()
- pipeline_model_mapping = (
- {
- "feature-extraction": QDQBertModel,
- "fill-mask": QDQBertForMaskedLM,
- "question-answering": QDQBertForQuestionAnswering,
- "text-classification": QDQBertForSequenceClassification,
- "text-generation": QDQBertLMHeadModel,
- "token-classification": QDQBertForTokenClassification,
- "zero-shot": QDQBertForSequenceClassification,
- }
- if is_torch_available()
- else {}
- )
-
- def setUp(self):
- self.model_tester = QDQBertModelTester(self)
- self.config_tester = ConfigTester(self, config_class=QDQBertConfig, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_various_embeddings(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- for type in ["absolute", "relative_key", "relative_key_query"]:
- config_and_inputs[0].position_embedding_type = type
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_model_as_decoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
-
- def test_model_as_decoder_with_default_input_mask(self):
- # This regression test was failing with PyTorch < 1.3
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- ) = self.model_tester.prepare_config_and_inputs_for_decoder()
-
- input_mask = None
-
- self.model_tester.create_and_check_model_as_decoder(
- config,
- input_ids,
- token_type_ids,
- input_mask,
- sequence_labels,
- token_labels,
- choice_labels,
- encoder_hidden_states,
- encoder_attention_mask,
- )
-
- def test_for_causal_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
-
- def test_for_masked_lm(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
- def test_for_causal_lm_decoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs)
-
- def test_decoder_model_past_with_large_inputs(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
- self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
- def test_for_multiple_choice(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
-
- def test_for_next_sequence_prediction(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
-
- def test_for_question_answering(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
- def test_for_sequence_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
- def test_for_token_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "google-bert/bert-base-uncased"
- model = QDQBertModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- # Override
- def test_feed_forward_chunking(self):
- # feed forward chunking is not supported in QDQBert
- pass
-
-
-@require_torch
-@require_pytorch_quantization
-class QDQBertModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_no_head_absolute_embedding(self):
- # Set default quantizers before creating the model.
- import pytorch_quantization.nn as quant_nn
- from pytorch_quantization.tensor_quant import QuantDescriptor
-
- # The default tensor quantizer is set to use the Max calibration method
- input_desc = QuantDescriptor(num_bits=8, calib_method="max")
- # The default weight quantizer is set to per-channel quantization
- weight_desc = QuantDescriptor(num_bits=8, axis=(0,))
- quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
- quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
-
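- # QDQBert mirrors the BERT architecture, so a vanilla BERT checkpoint can be loaded directly; the quantizer defaults set above apply to its quantized linear layers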
- model = QDQBertModel.from_pretrained("google-bert/bert-base-uncased")
- input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
- attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
- output = model(input_ids, attention_mask=attention_mask)[0]
- expected_shape = torch.Size((1, 11, 768))
- self.assertEqual(output.shape, expected_shape)
- expected_slice = torch.tensor(
- [[[0.4571, -0.0735, 0.8594], [0.2774, -0.0278, 0.8794], [0.3548, -0.0473, 0.7593]]]
- )
- self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
diff --git a/tests/models/realm/__init__.py b/tests/models/realm/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py
deleted file mode 100644
index 07a3b9d4b3b965..00000000000000
--- a/tests/models/realm/test_modeling_realm.py
+++ /dev/null
@@ -1,554 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch REALM model."""
-
-import copy
-import unittest
-
-import numpy as np
-
-from transformers import RealmConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers import (
- RealmEmbedder,
- RealmForOpenQA,
- RealmKnowledgeAugEncoder,
- RealmReader,
- RealmRetriever,
- RealmScorer,
- RealmTokenizer,
- )
-
-
-class RealmModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- retriever_proj_size=128,
- seq_length=7,
- is_training=True,
- use_input_mask=True,
- use_token_type_ids=True,
- use_labels=True,
- vocab_size=99,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- type_sequence_label_size=2,
- initializer_range=0.02,
- layer_norm_eps=1e-12,
- span_hidden_size=50,
- max_span_width=10,
- reader_layer_norm_eps=1e-3,
- reader_beam_size=4,
- reader_seq_len=288 + 32,
- num_block_records=13353718,
- searcher_beam_size=8,
- searcher_seq_len=64,
- num_labels=3,
- num_choices=4,
- num_candidates=10,
- scope=None,
- ):
- # General config
- self.parent = parent
- self.batch_size = batch_size
- self.retriever_proj_size = retriever_proj_size
- self.seq_length = seq_length
- self.is_training = is_training
- self.use_input_mask = use_input_mask
- self.use_token_type_ids = use_token_type_ids
- self.use_labels = use_labels
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.layer_norm_eps = layer_norm_eps
-
- # Reader config
- self.span_hidden_size = span_hidden_size
- self.max_span_width = max_span_width
- self.reader_layer_norm_eps = reader_layer_norm_eps
- self.reader_beam_size = reader_beam_size
- self.reader_seq_len = reader_seq_len
-
- # Searcher config
- self.num_block_records = num_block_records
- self.searcher_beam_size = searcher_beam_size
- self.searcher_seq_len = searcher_seq_len
-
- self.num_labels = num_labels
- self.num_choices = num_choices
- self.num_candidates = num_candidates
- self.scope = scope
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
- candidate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size)
- reader_input_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.vocab_size)
-
- input_mask = None
- candidate_input_mask = None
- reader_input_mask = None
- if self.use_input_mask:
- input_mask = random_attention_mask([self.batch_size, self.seq_length])
- candidate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length])
- reader_input_mask = random_attention_mask([self.reader_beam_size, self.reader_seq_len])
-
- token_type_ids = None
- candidate_token_type_ids = None
- reader_token_type_ids = None
- if self.use_token_type_ids:
- token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
- candidate_token_type_ids = ids_tensor(
- [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size
- )
- reader_token_type_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.type_vocab_size)
-
- sequence_labels = None
- token_labels = None
- choice_labels = None
- if self.use_labels:
- sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
- token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
- choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
- config = self.get_config()
-
- # inputs with additional num_candidates axis.
- scorer_encoder_inputs = (candidate_input_ids, candidate_input_mask, candidate_token_type_ids)
- # reader inputs
- reader_inputs = (reader_input_ids, reader_input_mask, reader_token_type_ids)
-
- return (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- )
-
- def get_config(self):
- return RealmConfig(
- vocab_size=self.vocab_size,
- hidden_size=self.hidden_size,
- retriever_proj_size=self.retriever_proj_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- num_candidates=self.num_candidates,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- max_position_embeddings=self.max_position_embeddings,
- type_vocab_size=self.type_vocab_size,
- initializer_range=self.initializer_range,
- )
-
- def create_and_check_embedder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- model = RealmEmbedder(config=config)
- model.to(torch_device)
- model.eval()
- result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertEqual(result.projected_score.shape, (self.batch_size, self.retriever_proj_size))
-
- def create_and_check_encoder(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- model = RealmKnowledgeAugEncoder(config=config)
- model.to(torch_device)
- model.eval()
- relevance_score = floats_tensor([self.batch_size, self.num_candidates])
- result = model(
- scorer_encoder_inputs[0],
- attention_mask=scorer_encoder_inputs[1],
- token_type_ids=scorer_encoder_inputs[2],
- relevance_score=relevance_score,
- labels=token_labels,
- )
- self.parent.assertEqual(
- result.logits.shape, (self.batch_size * self.num_candidates, self.seq_length, self.vocab_size)
- )
-
- def create_and_check_reader(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- model = RealmReader(config=config)
- model.to(torch_device)
- model.eval()
- relevance_score = floats_tensor([self.reader_beam_size])
- result = model(
- reader_inputs[0],
- attention_mask=reader_inputs[1],
- token_type_ids=reader_inputs[2],
- relevance_score=relevance_score,
- )
- self.parent.assertEqual(result.block_idx.shape, ())
- self.parent.assertEqual(result.candidate.shape, ())
- self.parent.assertEqual(result.start_pos.shape, ())
- self.parent.assertEqual(result.end_pos.shape, ())
-
- def create_and_check_scorer(
- self,
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- ):
- model = RealmScorer(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- input_ids,
- attention_mask=input_mask,
- token_type_ids=token_type_ids,
- candidate_input_ids=scorer_encoder_inputs[0],
- candidate_attention_mask=scorer_encoder_inputs[1],
- candidate_token_type_ids=scorer_encoder_inputs[2],
- )
- self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates))
- self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size))
- self.parent.assertEqual(
- result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size)
- )
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- token_type_ids,
- input_mask,
- scorer_encoder_inputs,
- reader_inputs,
- sequence_labels,
- token_labels,
- choice_labels,
- ) = config_and_inputs
- inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
- return config, inputs_dict
-
-
-@require_torch
-class RealmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (
- RealmEmbedder,
- RealmKnowledgeAugEncoder,
- # RealmScorer is excluded from common tests as it is a container model
- # consisting of two RealmEmbedders & a simple inner product calculation.
- # RealmScorer
- )
- if is_torch_available()
- else ()
- )
- all_generative_model_classes = ()
- pipeline_model_mapping = {} if is_torch_available() else {}
-
- # disable these tests because there is no base_model in Realm
- test_save_load_fast_init_from_base = False
- test_save_load_fast_init_to_base = False
-
- def setUp(self):
- self.test_pruning = False
- self.model_tester = RealmModelTester(self)
- self.config_tester = ConfigTester(self, config_class=RealmConfig)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_embedder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_embedder(*config_and_inputs)
-
- def test_encoder(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_encoder(*config_and_inputs)
-
- def test_model_various_embeddings(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- for type in ["absolute", "relative_key", "relative_key_query"]:
- config_and_inputs[0].position_embedding_type = type
- self.model_tester.create_and_check_embedder(*config_and_inputs)
- self.model_tester.create_and_check_encoder(*config_and_inputs)
-
- def test_scorer(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_scorer(*config_and_inputs)
-
- def test_training(self):
- if not self.model_tester.is_training:
- return
-
- config, *inputs = self.model_tester.prepare_config_and_inputs()
- input_ids, token_type_ids, input_mask, scorer_encoder_inputs = inputs[0:4]
- config.return_dict = True
-
- tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
-
- # RealmKnowledgeAugEncoder training
- model = RealmKnowledgeAugEncoder(config)
- model.to(torch_device)
- model.train()
-
- inputs_dict = {
- "input_ids": scorer_encoder_inputs[0].to(torch_device),
- "attention_mask": scorer_encoder_inputs[1].to(torch_device),
- "token_type_ids": scorer_encoder_inputs[2].to(torch_device),
- "relevance_score": floats_tensor([self.model_tester.batch_size, self.model_tester.num_candidates]),
- }
- inputs_dict["labels"] = torch.zeros(
- (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
- )
- inputs = inputs_dict
- loss = model(**inputs).loss
- loss.backward()
-
- # RealmForOpenQA training
- openqa_config = copy.deepcopy(config)
- openqa_config.vocab_size = 30522  # the retrieved texts will inevitably contain token ids outside the tiny 99-token test vocabulary
- openqa_config.num_block_records = 5
- openqa_config.searcher_beam_size = 2
-
- block_records = np.array(
- [
- b"This is the first record.",
- b"This is the second record.",
- b"This is the third record.",
- b"This is the fourth record.",
- b"This is the fifth record.",
- ],
- dtype=object,
- )
- retriever = RealmRetriever(block_records, tokenizer)
- model = RealmForOpenQA(openqa_config, retriever)
- model.to(torch_device)
- model.train()
-
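- # RealmForOpenQA is run on a single question at a time, hence the [:1] slices below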
- inputs_dict = {
- "input_ids": input_ids[:1].to(torch_device),
- "attention_mask": input_mask[:1].to(torch_device),
- "token_type_ids": token_type_ids[:1].to(torch_device),
- "answer_ids": input_ids[:1].tolist(),
- }
- inputs = self._prepare_for_class(inputs_dict, RealmForOpenQA)
- loss = model(**inputs).reader_output.loss
- loss.backward()
-
- # Test model.block_embedding_to
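- # block_embedding_to moves the block embedding tensor (model.block_emb) to the given device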
- device = torch.device("cpu")
- model.block_embedding_to(device)
- loss = model(**inputs).reader_output.loss
- loss.backward()
- self.assertEqual(model.block_emb.device.type, device.type)
-
- @slow
- def test_embedder_from_pretrained(self):
- model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
- self.assertIsNotNone(model)
-
- @slow
- def test_encoder_from_pretrained(self):
- model = RealmKnowledgeAugEncoder.from_pretrained("google/realm-cc-news-pretrained-encoder")
- self.assertIsNotNone(model)
-
- @slow
- def test_open_qa_from_pretrained(self):
- model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa")
- self.assertIsNotNone(model)
-
- @slow
- def test_reader_from_pretrained(self):
- model = RealmReader.from_pretrained("google/realm-orqa-nq-reader")
- self.assertIsNotNone(model)
-
- @slow
- def test_scorer_from_pretrained(self):
- model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer")
- self.assertIsNotNone(model)
-
-
-@require_torch
-class RealmModelIntegrationTest(unittest.TestCase):
- @slow
- def test_inference_embedder(self):
- retriever_projected_size = 128
-
- model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- output = model(input_ids)[0]
-
- expected_shape = torch.Size((1, retriever_projected_size))
- self.assertEqual(output.shape, expected_shape)
-
- expected_slice = torch.tensor([[-0.0714, -0.0837, -0.1314]])
- self.assertTrue(torch.allclose(output[:, :3], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_encoder(self):
- num_candidates = 2
- vocab_size = 30522
-
- model = RealmKnowledgeAugEncoder.from_pretrained(
- "google/realm-cc-news-pretrained-encoder", num_candidates=num_candidates
- )
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
- relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32)
- output = model(input_ids, relevance_score=relevance_score)[0]
-
- expected_shape = torch.Size((2, 6, vocab_size))
- self.assertEqual(output.shape, expected_shape)
-
- expected_slice = torch.tensor([[[-11.0888, -11.2544], [-10.2170, -10.3874]]])
-
- self.assertTrue(torch.allclose(output[1, :2, :2], expected_slice, atol=1e-4))
-
- @slow
- def test_inference_open_qa(self):
- from transformers.models.realm.retrieval_realm import RealmRetriever
-
- tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
- retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
-
- model = RealmForOpenQA.from_pretrained(
- "google/realm-orqa-nq-openqa",
- retriever=retriever,
- )
-
- question = "Who is the pioneer in modern computer science?"
-
- question = tokenizer(
- [question],
- padding=True,
- truncation=True,
- max_length=model.config.searcher_seq_len,
- return_tensors="pt",
- ).to(model.device)
-
- predicted_answer_ids = model(**question).predicted_answer_ids
-
- predicted_answer = tokenizer.decode(predicted_answer_ids)
- self.assertEqual(predicted_answer, "alan mathison turing")
-
- @slow
- def test_inference_reader(self):
- config = RealmConfig(reader_beam_size=2, max_span_width=3)
- model = RealmReader.from_pretrained("google/realm-orqa-nq-reader", config=config)
-
- concat_input_ids = torch.arange(10).view((2, 5))
- concat_token_type_ids = torch.tensor([[0, 0, 1, 1, 1], [0, 0, 1, 1, 1]], dtype=torch.int64)
- concat_block_mask = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 1, 1, 0]], dtype=torch.int64)
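- # block_mask marks the positions that belong to the retrieved evidence block (i.e. candidate answer tokens)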
- relevance_score = torch.tensor([0.3, 0.7], dtype=torch.float32)
-
- output = model(
- concat_input_ids,
- token_type_ids=concat_token_type_ids,
- relevance_score=relevance_score,
- block_mask=concat_block_mask,
- return_dict=True,
- )
-
- block_idx_expected_shape = torch.Size(())
- start_pos_expected_shape = torch.Size((1,))
- end_pos_expected_shape = torch.Size((1,))
- self.assertEqual(output.block_idx.shape, block_idx_expected_shape)
- self.assertEqual(output.start_pos.shape, start_pos_expected_shape)
- self.assertEqual(output.end_pos.shape, end_pos_expected_shape)
-
- expected_block_idx = torch.tensor(1)
- expected_start_pos = torch.tensor(3)
- expected_end_pos = torch.tensor(3)
-
- self.assertTrue(torch.allclose(output.block_idx, expected_block_idx, atol=1e-4))
- self.assertTrue(torch.allclose(output.start_pos, expected_start_pos, atol=1e-4))
- self.assertTrue(torch.allclose(output.end_pos, expected_end_pos, atol=1e-4))
-
- @slow
- def test_inference_scorer(self):
- num_candidates = 2
-
- model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=num_candidates)
-
- input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
- candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
- output = model(input_ids, candidate_input_ids=candidate_input_ids)[0]
-
- expected_shape = torch.Size((1, 2))
- self.assertEqual(output.shape, expected_shape)
-
- expected_slice = torch.tensor([[0.7410, 0.7170]])
- self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4))
diff --git a/tests/models/realm/test_retrieval_realm.py b/tests/models/realm/test_retrieval_realm.py
deleted file mode 100644
index ba65a6afdd6771..00000000000000
--- a/tests/models/realm/test_retrieval_realm.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import shutil
-import tempfile
-from unittest import TestCase
-from unittest.mock import patch
-
-import numpy as np
-from datasets import Dataset
-
-from transformers.models.realm.configuration_realm import RealmConfig
-from transformers.models.realm.retrieval_realm import _REALM_BLOCK_RECORDS_FILENAME, RealmRetriever
-from transformers.models.realm.tokenization_realm import VOCAB_FILES_NAMES, RealmTokenizer
-
-
-class RealmRetrieverTest(TestCase):
- def setUp(self):
- self.tmpdirname = tempfile.mkdtemp()
- self.num_block_records = 5
-
- # Realm tok
- vocab_tokens = [
- "[UNK]",
- "[CLS]",
- "[SEP]",
- "[PAD]",
- "[MASK]",
- "test",
- "question",
- "this",
- "is",
- "the",
- "first",
- "second",
- "third",
- "fourth",
- "fifth",
- "record",
- "want",
- "##want",
- "##ed",
- "wa",
- "un",
- "runn",
- "##ing",
- ",",
- "low",
- "lowest",
- ]
- realm_tokenizer_path = os.path.join(self.tmpdirname, "realm_tokenizer")
- os.makedirs(realm_tokenizer_path, exist_ok=True)
- self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"])
- with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
- vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
- realm_block_records_path = os.path.join(self.tmpdirname, "realm_block_records")
- os.makedirs(realm_block_records_path, exist_ok=True)
-
- def get_tokenizer(self) -> RealmTokenizer:
- return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer"))
-
- def tearDown(self):
- shutil.rmtree(self.tmpdirname)
-
- def get_config(self):
- config = RealmConfig(num_block_records=self.num_block_records)
- return config
-
- def get_dummy_dataset(self):
- dataset = Dataset.from_dict(
- {
- "id": ["0", "1"],
- "question": ["foo", "bar"],
- "answers": [["Foo", "Bar"], ["Bar"]],
- }
- )
- return dataset
-
- def get_dummy_block_records(self):
- block_records = np.array(
- [
- b"This is the first record",
- b"This is the second record",
- b"This is the third record",
- b"This is the fourth record",
- b"This is the fifth record",
- b"This is a longer longer longer record",
- ],
- dtype=object,
- )
- return block_records
-
- def get_dummy_retriever(self):
- retriever = RealmRetriever(
- block_records=self.get_dummy_block_records(),
- tokenizer=self.get_tokenizer(),
- )
- return retriever
-
- def test_retrieve(self):
- config = self.get_config()
- retriever = self.get_dummy_retriever()
- tokenizer = retriever.tokenizer
-
- retrieved_block_ids = np.array([0, 3], dtype="long")
- question_input_ids = tokenizer(["Test question"]).input_ids
- answer_ids = tokenizer(
- ["the fourth"],
- add_special_tokens=False,
- return_token_type_ids=False,
- return_attention_mask=False,
- ).input_ids
- max_length = config.reader_seq_len
-
- has_answers, start_pos, end_pos, concat_inputs = retriever(
- retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
- )
-
- self.assertEqual(len(has_answers), 2)
- self.assertEqual(len(start_pos), 2)
- self.assertEqual(len(end_pos), 2)
- self.assertEqual(concat_inputs.input_ids.shape, (2, 10))
- self.assertEqual(concat_inputs.attention_mask.shape, (2, 10))
- self.assertEqual(concat_inputs.token_type_ids.shape, (2, 10))
- self.assertEqual(concat_inputs.special_tokens_mask.shape, (2, 10))
- self.assertEqual(
- tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]),
- ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "first", "record", "[SEP]"],
- )
- self.assertEqual(
- tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]),
- ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "fourth", "record", "[SEP]"],
- )
-
- def test_block_has_answer(self):
- config = self.get_config()
- retriever = self.get_dummy_retriever()
- tokenizer = retriever.tokenizer
-
- retrieved_block_ids = np.array([0, 3, 5], dtype="long")
- question_input_ids = tokenizer(["Test question"]).input_ids
- answer_ids = tokenizer(
- ["the fourth", "longer longer"],
- add_special_tokens=False,
- return_token_type_ids=False,
- return_attention_mask=False,
- ).input_ids
- max_length = config.reader_seq_len
-
- has_answers, start_pos, end_pos, _ = retriever(
- retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np"
- )
-
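- # start/end positions index tokens of the concatenated question+block sequence; -1 pads blocks with no (or fewer) answer spans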
- self.assertEqual([False, True, True], has_answers)
- self.assertEqual([[-1, -1, -1], [6, -1, -1], [6, 7, 8]], start_pos)
- self.assertEqual([[-1, -1, -1], [7, -1, -1], [7, 8, 9]], end_pos)
-
- def test_save_load_pretrained(self):
- retriever = self.get_dummy_retriever()
- retriever.save_pretrained(os.path.join(self.tmpdirname, "realm_block_records"))
-
- # Test local path
- retriever = retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records"))
- self.assertEqual(retriever.block_records[0], b"This is the first record")
-
- # Test mocked remote path
- with patch("transformers.models.realm.retrieval_realm.hf_hub_download") as mock_hf_hub_download:
- mock_hf_hub_download.return_value = os.path.join(
- os.path.join(self.tmpdirname, "realm_block_records"), _REALM_BLOCK_RECORDS_FILENAME
- )
- retriever = RealmRetriever.from_pretrained("google/realm-cc-news-pretrained-openqa")
-
- self.assertEqual(retriever.block_records[0], b"This is the first record")
diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py
deleted file mode 100644
index 85c478837e53cf..00000000000000
--- a/tests/models/realm/test_tokenization_realm.py
+++ /dev/null
@@ -1,322 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from transformers import RealmTokenizerFast
-from transformers.models.bert.tokenization_bert import (
- VOCAB_FILES_NAMES,
- BasicTokenizer,
- WordpieceTokenizer,
- _is_control,
- _is_punctuation,
- _is_whitespace,
-)
-from transformers.models.realm.tokenization_realm import RealmTokenizer
-from transformers.testing_utils import require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
-
-
-@require_tokenizers
-class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
- from_pretrained_id = "google/realm-cc-news-pretrained-embedder"
- tokenizer_class = RealmTokenizer
- rust_tokenizer_class = RealmTokenizerFast
- test_rust_tokenizer = True
- space_between_special_tokens = True
- from_pretrained_filter = filter_non_english
-
- def setUp(self):
- super().setUp()
-
- vocab_tokens = [
- "[UNK]",
- "[CLS]",
- "[SEP]",
- "[PAD]",
- "[MASK]",
- "want",
- "##want",
- "##ed",
- "wa",
- "un",
- "runn",
- "##ing",
- ",",
- "low",
- "lowest",
- ]
- self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
- with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
- vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
- def get_input_output_texts(self, tokenizer):
- input_text = "UNwant\u00e9d,running"
- output_text = "unwanted, running"
- return input_text, output_text
-
- def test_full_tokenizer(self):
- tokenizer = self.tokenizer_class(self.vocab_file)
-
- tokens = tokenizer.tokenize("UNwant\u00e9d,running")
- self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
- self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
- def test_rust_and_python_full_tokenizers(self):
- if not self.test_rust_tokenizer:
- return
-
- tokenizer = self.get_tokenizer()
- rust_tokenizer = self.get_rust_tokenizer()
-
- sequence = "UNwant\u00e9d,running"
-
- tokens = tokenizer.tokenize(sequence)
- rust_tokens = rust_tokenizer.tokenize(sequence)
- self.assertListEqual(tokens, rust_tokens)
-
- ids = tokenizer.encode(sequence, add_special_tokens=False)
- rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
- self.assertListEqual(ids, rust_ids)
-
- rust_tokenizer = self.get_rust_tokenizer()
- ids = tokenizer.encode(sequence)
- rust_ids = rust_tokenizer.encode(sequence)
- self.assertListEqual(ids, rust_ids)
-
- # With lower casing
- tokenizer = self.get_tokenizer(do_lower_case=True)
- rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
-
- sequence = "UNwant\u00e9d,running"
-
- tokens = tokenizer.tokenize(sequence)
- rust_tokens = rust_tokenizer.tokenize(sequence)
- self.assertListEqual(tokens, rust_tokens)
-
- ids = tokenizer.encode(sequence, add_special_tokens=False)
- rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
- self.assertListEqual(ids, rust_ids)
-
- rust_tokenizer = self.get_rust_tokenizer()
- ids = tokenizer.encode(sequence)
- rust_ids = rust_tokenizer.encode(sequence)
- self.assertListEqual(ids, rust_ids)
-
- def test_chinese(self):
- tokenizer = BasicTokenizer()
-
- self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])
-
- def test_basic_tokenizer_lower(self):
- tokenizer = BasicTokenizer(do_lower_case=True)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"]
- )
- self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
- def test_basic_tokenizer_lower_strip_accents_false(self):
- tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"]
- )
- self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])
-
- def test_basic_tokenizer_lower_strip_accents_true(self):
- tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
- )
- self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
- def test_basic_tokenizer_lower_strip_accents_default(self):
- tokenizer = BasicTokenizer(do_lower_case=True)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
- )
- self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
- def test_basic_tokenizer_no_lower(self):
- tokenizer = BasicTokenizer(do_lower_case=False)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
- )
-
- def test_basic_tokenizer_no_lower_strip_accents_false(self):
- tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
- )
-
- def test_basic_tokenizer_no_lower_strip_accents_true(self):
- tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
- )
-
- def test_basic_tokenizer_respects_never_split_tokens(self):
- tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
- self.assertListEqual(
- tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
- )
-
- def test_wordpiece_tokenizer(self):
- vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
- vocab = {}
- for i, token in enumerate(vocab_tokens):
- vocab[token] = i
- tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
- self.assertListEqual(tokenizer.tokenize(""), [])
-
- self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
- self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
- def test_is_whitespace(self):
- self.assertTrue(_is_whitespace(" "))
- self.assertTrue(_is_whitespace("\t"))
- self.assertTrue(_is_whitespace("\r"))
- self.assertTrue(_is_whitespace("\n"))
- self.assertTrue(_is_whitespace("\u00a0"))
-
- self.assertFalse(_is_whitespace("A"))
- self.assertFalse(_is_whitespace("-"))
-
- def test_is_control(self):
- self.assertTrue(_is_control("\u0005"))
-
- self.assertFalse(_is_control("A"))
- self.assertFalse(_is_control(" "))
- self.assertFalse(_is_control("\t"))
- self.assertFalse(_is_control("\r"))
-
- def test_is_punctuation(self):
- self.assertTrue(_is_punctuation("-"))
- self.assertTrue(_is_punctuation("$"))
- self.assertTrue(_is_punctuation("`"))
- self.assertTrue(_is_punctuation("."))
-
- self.assertFalse(_is_punctuation("A"))
- self.assertFalse(_is_punctuation(" "))
-
- def test_clean_text(self):
- tokenizer = self.get_tokenizer()
-
- # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
- self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
-
- if self.test_rust_tokenizer:
- rust_tokenizer = self.get_rust_tokenizer()
- self.assertListEqual(
- [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
- )
-
- @slow
- def test_sequence_builders(self):
- tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
-
- text = tokenizer.encode("sequence builders", add_special_tokens=False)
- text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
- encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
- encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
- assert encoded_sentence == [101] + text + [102]
- assert encoded_pair == [101] + text + [102] + text_2 + [102]
-
- def test_offsets_with_special_characters(self):
- for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
- with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
- tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
- sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
- tokens = tokenizer_r.encode_plus(
- sentence,
- return_attention_mask=False,
- return_token_type_ids=False,
- return_offsets_mapping=True,
- add_special_tokens=True,
- )
-
- do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
- expected_results = (
- [
- ((0, 0), tokenizer_r.cls_token),
- ((0, 1), "A"),
- ((1, 2), ","),
- ((3, 5), "na"),
- ((5, 6), "##ï"),
- ((6, 8), "##ve"),
- ((9, 15), tokenizer_r.mask_token),
- ((16, 21), "Allen"),
- ((21, 23), "##NL"),
- ((23, 24), "##P"),
- ((25, 33), "sentence"),
- ((33, 34), "."),
- ((0, 0), tokenizer_r.sep_token),
- ]
- if not do_lower_case
- else [
- ((0, 0), tokenizer_r.cls_token),
- ((0, 1), "a"),
- ((1, 2), ","),
- ((3, 8), "naive"),
- ((9, 15), tokenizer_r.mask_token),
- ((16, 21), "allen"),
- ((21, 23), "##nl"),
- ((23, 24), "##p"),
- ((25, 33), "sentence"),
- ((33, 34), "."),
- ((0, 0), tokenizer_r.sep_token),
- ]
- )
-
- self.assertEqual(
- [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
- )
- self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
-
- @slow
- def test_batch_encode_candidates(self):
- for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
- with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
- tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
- tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
- text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
-
- encoded_sentence_r = tokenizer_r.batch_encode_candidates(text, max_length=10, return_tensors="np")
- encoded_sentence_p = tokenizer_p.batch_encode_candidates(text, max_length=10, return_tensors="np")
-
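- # batch_encode_candidates returns tensors of shape (batch_size, num_candidates, max_length)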
- expected_shape = (2, 2, 10)
-
- self.assertEqual(encoded_sentence_r["input_ids"].shape, expected_shape)
- self.assertEqual(encoded_sentence_r["attention_mask"].shape, expected_shape)
- self.assertEqual(encoded_sentence_r["token_type_ids"].shape, expected_shape)
-
- self.assertEqual(encoded_sentence_p["input_ids"].shape, expected_shape)
- self.assertEqual(encoded_sentence_p["attention_mask"].shape, expected_shape)
- self.assertEqual(encoded_sentence_p["token_type_ids"].shape, expected_shape)
diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
index 08c3bc61787ec0..d7b85e7b48bcbe 100644
--- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
+++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
@@ -23,7 +23,6 @@
from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from ..bert.test_modeling_bert import BertModelTester
from ..speech_to_text.test_modeling_speech_to_text import Speech2TextModelTester
-from ..speech_to_text_2.test_modeling_speech_to_text_2 import Speech2Text2StandaloneDecoderModelTester
from ..wav2vec2.test_modeling_wav2vec2 import Wav2Vec2ModelTester
@@ -33,7 +32,6 @@
from transformers import (
BertLMHeadModel,
- Speech2Text2ForCausalLM,
SpeechEncoderDecoderConfig,
SpeechEncoderDecoderModel,
Wav2Vec2Model,
@@ -583,43 +581,3 @@ def test_save_and_load_from_pretrained(self):
# all published pretrained models are Speech2TextModel != Speech2TextEncoder
def test_real_model_save_load_from_pretrained(self):
pass
-
-
-@require_torch
-class Wav2Vec2Speech2Text2(EncoderDecoderMixin, unittest.TestCase):
- def get_encoder_decoder_model(self, config, decoder_config):
- encoder_model = Wav2Vec2Model(config).eval()
- decoder_model = Speech2Text2ForCausalLM(decoder_config).eval()
- return encoder_model, decoder_model
-
- def prepare_config_and_inputs(self):
- model_tester_encoder = Wav2Vec2ModelTester(self, batch_size=13)
- model_tester_decoder = Speech2Text2StandaloneDecoderModelTester(
- self, batch_size=13, d_model=32, max_position_embeddings=512
- )
- encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
- decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs()
- (
- config,
- input_values,
- input_mask,
- ) = encoder_config_and_inputs
- (decoder_config, decoder_input_ids, decoder_attention_mask, _) = decoder_config_and_inputs
-
- # make sure that cross attention layers are added
- decoder_config.add_cross_attention = True
- # disable cache for now
- decoder_config.use_cache = False
- return {
- "config": config,
- "input_values": input_values,
- "attention_mask": input_mask,
- "decoder_config": decoder_config,
- "decoder_input_ids": decoder_input_ids,
- "decoder_attention_mask": decoder_attention_mask,
- "labels": decoder_input_ids,
- }
-
- # there are no published pretrained Speech2Text2ForCausalLM checkpoints for now
- def test_real_model_save_load_from_pretrained(self):
- pass
diff --git a/tests/models/speech_to_text_2/__init__.py b/tests/models/speech_to_text_2/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
deleted file mode 100644
index fffa16aa30079d..00000000000000
--- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch Speech2Text model."""
-
-import unittest
-
-from transformers import Speech2Text2Config
-from transformers.testing_utils import is_torch_available, require_torch, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
-
- from transformers.models.speech_to_text_2.modeling_speech_to_text_2 import (
- Speech2Text2Decoder,
- Speech2Text2ForCausalLM,
- )
-
-
-@require_torch
-class Speech2Text2StandaloneDecoderModelTester:
- def __init__(
- self,
- parent,
- vocab_size=99,
- batch_size=13,
- d_model=16,
- decoder_seq_length=7,
- is_training=True,
- is_decoder=True,
- use_attention_mask=True,
- use_cache=False,
- use_labels=True,
- decoder_start_token_id=2,
- decoder_ffn_dim=32,
- decoder_layers=2,
- decoder_attention_heads=4,
- max_position_embeddings=30,
- pad_token_id=0,
- bos_token_id=1,
- eos_token_id=2,
- scope=None,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.decoder_seq_length = decoder_seq_length
- # For common tests
- self.seq_length = self.decoder_seq_length
- self.is_training = is_training
- self.use_attention_mask = use_attention_mask
- self.use_labels = use_labels
-
- self.vocab_size = vocab_size
- self.d_model = d_model
- self.hidden_size = d_model
- self.num_hidden_layers = decoder_layers
- self.decoder_layers = decoder_layers
- self.decoder_ffn_dim = decoder_ffn_dim
- self.decoder_attention_heads = decoder_attention_heads
- self.num_attention_heads = decoder_attention_heads
- self.eos_token_id = eos_token_id
- self.bos_token_id = bos_token_id
- self.pad_token_id = pad_token_id
- self.decoder_start_token_id = decoder_start_token_id
- self.use_cache = use_cache
- self.max_position_embeddings = max_position_embeddings
-
- self.scope = None
- self.decoder_key_length = decoder_seq_length
- self.base_model_out_len = 2
- self.decoder_attention_idx = 1
-
- def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
- attention_mask = None
- if self.use_attention_mask:
- attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
- lm_labels = None
- if self.use_labels:
- lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
- config = Speech2Text2Config(
- vocab_size=self.vocab_size,
- d_model=self.d_model,
- decoder_layers=self.decoder_layers,
- decoder_ffn_dim=self.decoder_ffn_dim,
- decoder_attention_heads=self.decoder_attention_heads,
- eos_token_id=self.eos_token_id,
- bos_token_id=self.bos_token_id,
- use_cache=self.use_cache,
- pad_token_id=self.pad_token_id,
- decoder_start_token_id=self.decoder_start_token_id,
- max_position_embeddings=self.max_position_embeddings,
- )
-
- return (
- config,
- input_ids,
- attention_mask,
- lm_labels,
- )
-
- def create_and_check_decoder_model_past(
- self,
- config,
- input_ids,
- attention_mask,
- lm_labels,
- ):
- config.use_cache = True
- model = Speech2Text2Decoder(config=config).to(torch_device).eval()
- input_ids = input_ids[:2]
-
- input_ids[input_ids == 0] += 1
- # first forward pass
- outputs = model(input_ids, use_cache=True)
- outputs_use_cache_conf = model(input_ids)
- outputs_no_past = model(input_ids, use_cache=False)
-
- self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
- self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
- past_key_values = outputs["past_key_values"]
-
- # create hypothetical next token and extend to next_input_ids
- next_tokens = ids_tensor((2, 1), config.vocab_size - 1) + 1
-
- # append the new token to input_ids to form next_input_ids
- next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
- print(next_input_ids)
-
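- # feeding only the new token together with the cached past_key_values should reproduce
- # the hidden state obtained from a full forward pass over the concatenated sequence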
- output_from_no_past = model(next_input_ids)["last_hidden_state"]
- output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
-
- # select random slice
- random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
- output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
- output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
- # test that outputs are equal for slice
- assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (
- config,
- input_ids,
- attention_mask,
- lm_labels,
- ) = config_and_inputs
-
- inputs_dict = {
- "input_ids": input_ids,
- "attention_mask": attention_mask,
- }
- return config, inputs_dict
-
-
-@require_torch
-class Speech2Text2StandaloneDecoderModelTest(
- ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
- all_model_classes = (Speech2Text2Decoder, Speech2Text2ForCausalLM) if is_torch_available() else ()
- all_generative_model_classes = (Speech2Text2ForCausalLM,) if is_torch_available() else ()
- pipeline_model_mapping = {"text-generation": Speech2Text2ForCausalLM} if is_torch_available() else {}
- fx_compatible = True
- test_pruning = False
-
- def setUp(
- self,
- ):
- self.model_tester = Speech2Text2StandaloneDecoderModelTester(self, is_training=False)
- self.config_tester = ConfigTester(self, config_class=Speech2Text2Config)
-
- # not implemented currently
- def test_inputs_embeds(self):
- pass
-
- # speech2text2 has no base model
- def test_save_load_fast_init_from_base(self):
- pass
-
- # speech2text2 has no base model
- def test_save_load_fast_init_to_base(self):
- pass
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- def test_decoder_model_past(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
-
- # decoder cannot keep gradients
- def test_retain_grad_hidden_states_attentions(self):
- return
diff --git a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
deleted file mode 100644
index df433d67d96230..00000000000000
--- a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import json
-import os
-import tempfile
-import unittest
-
-from transformers.models.speech_to_text_2 import Speech2Text2Tokenizer
-from transformers.models.speech_to_text_2.tokenization_speech_to_text_2 import VOCAB_FILES_NAMES
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
- from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de"
- tokenizer_class = Speech2Text2Tokenizer
- test_rust_tokenizer = False
-
- def setUp(self):
- super().setUp()
-
- vocab = " here@@ a couple of@@ words for the he@@ re@@ vocab".split(" ")
- merges = ["he re 123", "here a 1456"]
- vocab_tokens = dict(zip(vocab, range(len(vocab))))
-
- self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-
- self.tmpdirname = tempfile.mkdtemp()
- self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
- self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-
- with open(self.vocab_file, "w", encoding="utf-8") as fp:
- fp.write(json.dumps(vocab_tokens) + "\n")
-
- with open(self.merges_file, "w") as fp:
- fp.write("\n".join(merges))
-
- def test_get_vocab(self):
- vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
- self.assertEqual(vocab_keys[0], "<s>")
- self.assertEqual(vocab_keys[1], "<pad>")
- self.assertEqual(vocab_keys[-1], "vocab")
- self.assertEqual(len(vocab_keys), 14)
-
- def test_vocab_size(self):
- self.assertEqual(self.get_tokenizer().vocab_size, 14)
-
- def test_tokenizer_decode(self):
- tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)
-
- # make sure @@ is correctly concatenated
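- # "@@" marks a subword that continues into the next token, so it is joined without a space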
- token_ids = [4, 6, 8, 7, 10] # ["here@@", "couple", "words", "of@@", "the"]
- output_string = tokenizer.decode(token_ids)
-
- self.assertTrue(output_string == "herecouple words ofthe")
-
- def test_load_no_merges_file(self):
- tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)
-
- with tempfile.TemporaryDirectory() as tmp_dirname:
- tokenizer.save_pretrained(tmp_dirname)
- os.remove(os.path.join(tmp_dirname, "merges.txt"))
-
- # load tokenizer without merges file should not throw an error
- tokenizer = Speech2Text2Tokenizer.from_pretrained(tmp_dirname)
-
- with tempfile.TemporaryDirectory() as tmp_dirname:
- # save tokenizer and load again
- tokenizer.save_pretrained(tmp_dirname)
- tokenizer = Speech2Text2Tokenizer.from_pretrained(tmp_dirname)
-
- self.assertIsNotNone(tokenizer)
-
- # overwrite since merges_file is optional
- def test_tokenizer_slow_store_full_signature(self):
- if not self.test_slow_tokenizer:
- return
-
- signature = inspect.signature(self.tokenizer_class.__init__)
- tokenizer = self.get_tokenizer()
-
- for parameter_name, parameter in signature.parameters.items():
- if parameter.default != inspect.Parameter.empty and parameter_name != "merges_file":
- self.assertIn(parameter_name, tokenizer.init_kwargs)
diff --git a/tests/models/tvlt/__init__.py b/tests/models/tvlt/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/tvlt/test_feature_extraction_tvlt.py b/tests/models/tvlt/test_feature_extraction_tvlt.py
deleted file mode 100644
index a0b3f7a916e457..00000000000000
--- a/tests/models/tvlt/test_feature_extraction_tvlt.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TVLT feature extraction."""
-
-import itertools
-import random
-import unittest
-
-import numpy as np
-
-from transformers import TvltFeatureExtractor, is_datasets_available
-from transformers.testing_utils import require_torch, require_torchaudio
-from transformers.utils.import_utils import is_torch_available
-
-from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
-
-
-if is_torch_available():
- import torch
-
-if is_datasets_available():
- from datasets import load_dataset
-
-global_rng = random.Random()
-
-
-# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
-def floats_list(shape, scale=1.0, rng=None, name=None):
- """Creates a random float32 tensor"""
- if rng is None:
- rng = global_rng
-
- values = []
- for batch_idx in range(shape[0]):
- values.append([])
- for _ in range(shape[1]):
- values[-1].append(rng.random() * scale)
-
- return values
-
-
-class TvltFeatureExtractionTester(unittest.TestCase):
- def __init__(
- self,
- parent,
- batch_size=7,
- min_seq_length=400,
- max_seq_length=2000,
- spectrogram_length=2048,
- feature_size=128,
- num_audio_channels=1,
- hop_length=512,
- chunk_length=30,
- sampling_rate=44100,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.min_seq_length = min_seq_length
- self.max_seq_length = max_seq_length
- self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
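- # with the defaults above, successive inputs grow by (2000 - 400) // 6 = 266 samples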
- self.spectrogram_length = spectrogram_length
- self.feature_size = feature_size
- self.num_audio_channels = num_audio_channels
- self.hop_length = hop_length
- self.chunk_length = chunk_length
- self.sampling_rate = sampling_rate
-
- def prepare_feat_extract_dict(self):
- return {
- "spectrogram_length": self.spectrogram_length,
- "feature_size": self.feature_size,
- "num_audio_channels": self.num_audio_channels,
- "hop_length": self.hop_length,
- "chunk_length": self.chunk_length,
- "sampling_rate": self.sampling_rate,
- }
-
- def prepare_inputs_for_common(self, equal_length=False, numpify=False):
- def _flatten(list_of_lists):
- return list(itertools.chain(*list_of_lists))
-
- if equal_length:
- speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
- else:
- # make sure that inputs increase in size
- speech_inputs = [
- floats_list((x, self.feature_size))
- for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
- ]
- if numpify:
- speech_inputs = [np.asarray(x) for x in speech_inputs]
- return speech_inputs
-
-
-@require_torch
-@require_torchaudio
-class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
- feature_extraction_class = TvltFeatureExtractor
-
- def setUp(self):
- self.feat_extract_tester = TvltFeatureExtractionTester(self)
-
- def test_feat_extract_properties(self):
- feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
- self.assertTrue(hasattr(feature_extractor, "spectrogram_length"))
- self.assertTrue(hasattr(feature_extractor, "feature_size"))
- self.assertTrue(hasattr(feature_extractor, "num_audio_channels"))
- self.assertTrue(hasattr(feature_extractor, "hop_length"))
- self.assertTrue(hasattr(feature_extractor, "chunk_length"))
- self.assertTrue(hasattr(feature_extractor, "sampling_rate"))
-
- def test_call(self):
- # Initialize feature_extractor
- feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
-
- # create three inputs of length 800, 1000, and 1200
- speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
- np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
-
- # Test not batched input
- encoded_audios = feature_extractor(np_speech_inputs[0], return_tensors="np", sampling_rate=44100).audio_values
-
- self.assertTrue(encoded_audios.ndim == 4)
- self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
- self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
- self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
- # Test batched
- encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
-
- self.assertTrue(encoded_audios.ndim == 4)
- self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
- self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
- self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
- # Test audio masking
- encoded_audios = feature_extractor(
- np_speech_inputs, return_tensors="np", sampling_rate=44100, mask_audio=True
- ).audio_values
-
- self.assertTrue(encoded_audios.ndim == 4)
- self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
- self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
- self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
- # Test 2-D numpy arrays are batched.
- speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
- np_speech_inputs = np.asarray(speech_inputs)
- encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
- self.assertTrue(encoded_audios.ndim == 4)
- self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
- self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
- self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
-
- def _load_datasamples(self, num_samples):
- ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- # automatic decoding with librispeech
- speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
- return [x["array"] for x in speech_samples]
-
- def test_integration(self):
- input_speech = self._load_datasamples(1)
- feature_extractor = TvltFeatureExtractor()
- audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
-
- self.assertEqual(audio_values.shape, (1, 1, 192, 128))
-
- expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
- self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))
diff --git a/tests/models/tvlt/test_image_processor_tvlt.py b/tests/models/tvlt/test_image_processor_tvlt.py
deleted file mode 100644
index c2974da6d8cd19..00000000000000
--- a/tests/models/tvlt/test_image_processor_tvlt.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the TVLT image processor."""
-
-import unittest
-
-import numpy as np
-
-from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
-
-from ...test_image_processing_common import ImageProcessingTestMixin
-
-
-if is_torch_available():
- import torch
-
-if is_vision_available():
- from PIL import Image
-
- from transformers import TvltImageProcessor
-
-
-def prepare_video(image_processor_tester, width=10, height=10, numpify=False, torchify=False):
- """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
-
- video = []
- for i in range(image_processor_tester.num_frames):
- video.append(np.random.randint(255, size=(image_processor_tester.num_channels, width, height), dtype=np.uint8))
-
- if not numpify and not torchify:
- # PIL expects the channel dimension as last dimension
- video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]
-
- if torchify:
- video = [torch.from_numpy(frame) for frame in video]
-
- return video
-
-
-def prepare_video_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
- """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
- one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.
- One can specify whether the videos are of the same resolution or not.
- """
-
- assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
-
- video_inputs = []
- for i in range(image_processor_tester.batch_size):
- if equal_resolution:
- width = height = image_processor_tester.max_resolution
- else:
- width, height = np.random.choice(
- np.arange(image_processor_tester.min_resolution, image_processor_tester.max_resolution), 2
- )
- video = prepare_video(
- image_processor_tester=image_processor_tester,
- width=width,
- height=height,
- numpify=numpify,
- torchify=torchify,
- )
- video_inputs.append(video)
-
- return video_inputs
-
-
-class TvltImageProcessorTester(unittest.TestCase):
- def __init__(
- self,
- parent,
- batch_size=7,
- num_channels=3,
- num_frames=4,
- image_size=18,
- min_resolution=30,
- max_resolution=400,
- do_resize=True,
- size=None,
- do_normalize=True,
- image_mean=[0.5, 0.5, 0.5],
- image_std=[0.5, 0.5, 0.5],
- do_center_crop=True,
- crop_size=None,
- ):
- size = size if size is not None else {"shortest_edge": 18}
- crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
-
- self.parent = parent
- self.batch_size = batch_size
- self.num_channels = num_channels
- self.num_frames = num_frames
- self.image_size = image_size
- self.min_resolution = min_resolution
- self.max_resolution = max_resolution
- self.do_resize = do_resize
- self.size = size
- self.do_normalize = do_normalize
- self.image_mean = image_mean
- self.image_std = image_std
- self.do_center_crop = do_center_crop
- self.crop_size = crop_size
-
- def prepare_image_processor_dict(self):
- return {
- "image_mean": self.image_mean,
- "image_std": self.image_std,
- "do_normalize": self.do_normalize,
- "do_resize": self.do_resize,
- "size": self.size,
- "do_center_crop": self.do_center_crop,
- "crop_size": self.crop_size,
- }
-
-
-@require_torch
-@require_vision
-class TvltImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
- image_processing_class = TvltImageProcessor if is_vision_available() else None
-
- def setUp(self):
- self.image_processor_tester = TvltImageProcessorTester(self)
-
- @property
- def image_processor_dict(self):
- return self.image_processor_tester.prepare_image_processor_dict()
-
- def test_image_processor_properties(self):
- image_processor = self.image_processing_class(**self.image_processor_dict)
- self.assertTrue(hasattr(image_processor, "image_mean"))
- self.assertTrue(hasattr(image_processor, "image_std"))
- self.assertTrue(hasattr(image_processor, "do_normalize"))
- self.assertTrue(hasattr(image_processor, "do_resize"))
- self.assertTrue(hasattr(image_processor, "do_center_crop"))
- self.assertTrue(hasattr(image_processor, "size"))
-
- def test_call_pil(self):
- # Initialize image_processor
- image_processor = self.image_processing_class(**self.image_processor_dict)
- # create random PIL videos
- video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False)
- for video in video_inputs:
- self.assertIsInstance(video, list)
- self.assertIsInstance(video[0], Image.Image)
-
- # Test not batched input
- encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- 1,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- # Test batched
- encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- self.image_processor_tester.batch_size,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- def test_call_numpy(self):
- # Initialize image_processor
- image_processor = self.image_processing_class(**self.image_processor_dict)
- # create random numpy tensors
- video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
- for video in video_inputs:
- self.assertIsInstance(video, list)
- self.assertIsInstance(video[0], np.ndarray)
-
- # Test not batched input
- encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- 1,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- # Test batched
- encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- self.image_processor_tester.batch_size,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- def test_call_numpy_4_channels(self):
- # Initialize image_processor
- image_processor = self.image_processing_class(**self.image_processor_dict)
- # create random numpy tensors
- self.image_processor_tester.num_channels = 4
- video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
- for video in video_inputs:
- self.assertIsInstance(video, list)
- self.assertIsInstance(video[0], np.ndarray)
-
- # Test not batched input
- encoded_videos = image_processor(
- video_inputs[0], return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
- ).pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- 1,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- # Test batched
- encoded_videos = image_processor(
- video_inputs, return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
- ).pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- self.image_processor_tester.batch_size,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
- self.image_processor_tester.num_channels = 3
-
- def test_call_pytorch(self):
- # Initialize image_processor
- image_processor = self.image_processing_class(**self.image_processor_dict)
- # create random PyTorch tensors
- video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
- for video in video_inputs:
- self.assertIsInstance(video, list)
- self.assertIsInstance(video[0], torch.Tensor)
-
- # Test not batched input
- encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- 1,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
-
- # Test batched
- encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
- self.assertEqual(
- encoded_videos.shape,
- (
- self.image_processor_tester.batch_size,
- self.image_processor_tester.num_frames,
- self.image_processor_tester.num_channels,
- self.image_processor_tester.crop_size["height"],
- self.image_processor_tester.crop_size["width"],
- ),
- )
diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py
deleted file mode 100644
index 574559a7a2f151..00000000000000
--- a/tests/models/tvlt/test_modeling_tvlt.py
+++ /dev/null
@@ -1,625 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch TVLT model."""
-
-import copy
-import inspect
-import unittest
-
-import numpy as np
-from huggingface_hub import hf_hub_download
-
-from transformers import (
- TvltConfig,
- is_datasets_available,
- is_speech_available,
- is_torch_available,
- is_vision_available,
-)
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, floats_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
- import torch.nn as nn
-
- from transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel
-
-
-if is_datasets_available():
- from datasets import load_dataset
-
-if is_vision_available():
- from transformers import TvltImageProcessor
-
-if is_speech_available():
- from transformers import TvltFeatureExtractor
-
-
-class TvltModelTester:
- def __init__(
- self,
- parent,
- batch_size=2,
- image_size=32,
- spectrogram_length=32,
- frequency_length=16,
- image_patch_size=[2, 2],
- audio_patch_size=[2, 2],
- num_image_channels=3,
- num_audio_channels=1,
- num_frames=2,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=128,
- hidden_act="gelu",
- hidden_dropout_prob=0.0,
- attention_probs_dropout_prob=0.0,
- initializer_range=0.02,
- layer_norm_eps=1e-12,
- qkv_bias=True,
- use_mean_pooling=True,
- decoder_num_attention_heads=4,
- decoder_hidden_size=32,
- decoder_num_hidden_layers=2,
- decoder_intermediate_size=128,
- image_mask_ratio=0.75,
- audio_mask_ratio=0.15,
- audio_mask_type="frame-level",
- task_matching=True,
- task_mae=True,
- num_labels=1,
- is_training=True,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.image_size = image_size
- self.spectrogram_length = spectrogram_length
- self.frequency_length = frequency_length
- self.image_patch_size = image_patch_size
- self.audio_patch_size = audio_patch_size
- self.num_image_channels = num_image_channels
- self.num_audio_channels = num_audio_channels
- self.num_frames = num_frames
-
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.initializer_range = initializer_range
- self.layer_norm_eps = layer_norm_eps
- self.qkv_bias = qkv_bias
- self.use_mean_pooling = use_mean_pooling
-
- self.decoder_num_attention_heads = decoder_num_attention_heads
- self.decoder_hidden_size = decoder_hidden_size
- self.decoder_num_hidden_layers = decoder_num_hidden_layers
- self.decoder_intermediate_size = decoder_intermediate_size
- self.image_mask_ratio = image_mask_ratio
- self.audio_mask_ratio = audio_mask_ratio
-
- self.task_matching = task_matching
- self.task_mae = task_mae
- self.num_labels = num_labels
-
- self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames
- self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * (
- self.frequency_length // self.audio_patch_size[1]
- )
- # we set the expected sequence length (which is used in several tests)
- # this is equal to the number of image/video patches + the number of audio patches + 1 for the cls embedding
- self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1
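- # with the defaults above: pixel = (32 // 2) ** 2 * 2 = 512, audio = (32 // 2) * (16 // 2) = 128,
- # so expected_seq_len = 512 + 128 + 1 = 641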
-
- self.image_mae_output_dim = image_patch_size[0] ** 2 * num_image_channels
- self.audio_mae_output_dim = audio_patch_size[0] * audio_patch_size[1] * num_audio_channels
- self.is_training = is_training
-
- def prepare_config_and_inputs(self):
- pixel_values = floats_tensor(
- [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
- )
- audio_values = floats_tensor(
- [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
- )
-
- pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
- audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
-
- config = self.get_config()
-
- return (config, pixel_values, audio_values, pixel_mask, audio_mask)
-
- def prepare_config_and_inputs_for_pretraining(self):
- pixel_values = floats_tensor(
- [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
- )
- audio_values = floats_tensor(
- [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
- )
-
- pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
- audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
-
- pixel_values_mixed = floats_tensor(
- [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
- )
- pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
- labels = floats_tensor([self.batch_size])
- config = self.get_config()
-
- return (
- config,
- pixel_values,
- audio_values,
- pixel_mask,
- audio_mask,
- pixel_values_mixed,
- pixel_mask_mixed,
- labels,
- )
-
- def get_config(self):
- return TvltConfig(
- image_size=self.image_size,
- spectrogram_length=self.spectrogram_length,
- frequency_length=self.frequency_length,
- image_patch_size=self.image_patch_size,
- audio_patch_size=self.audio_patch_size,
- num_image_channels=self.num_image_channels,
- num_audio_channels=self.num_audio_channels,
- num_frames=self.num_frames,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- initializer_range=self.initializer_range,
- layer_norm_eps=self.layer_norm_eps,
- qkv_bias=self.qkv_bias,
- use_mean_pooling=self.use_mean_pooling,
- decoder_num_attention_heads=self.decoder_num_attention_heads,
- decoder_hidden_size=self.decoder_hidden_size,
- decoder_num_hidden_layers=self.decoder_num_hidden_layers,
- decoder_intermediate_size=self.decoder_intermediate_size,
- image_mask_ratio=self.image_mask_ratio,
- audio_mask_ratio=self.audio_mask_ratio,
- task_matching=self.task_matching,
- task_mae=self.task_mae,
- num_labels=self.num_labels,
- )
-
- def create_and_check_model(self, config, pixel_values, audio_values, pixel_mask, audio_mask):
- model = TvltModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
- result = model(pixel_values, audio_values)
- self.parent.assertEqual(
- result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size)
- )
-
- def create_and_check_for_audiovisual_classification(
- self, config, pixel_values, audio_values, pixel_mask, audio_mask
- ):
- model = TvltForAudioVisualClassification(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
- result = model(pixel_values, audio_values)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_pretraining(
- self,
- config,
- pixel_values,
- audio_values,
- pixel_mask,
- audio_mask,
- pixel_values_mixed,
- pixel_mask_mixed,
- labels,
- ):
- model = TvltForPreTraining(config=config)
- model.to(torch_device)
- model.train()
- result = model(
- pixel_values,
- audio_values,
- pixel_mask,
- audio_mask,
- pixel_values_mixed=pixel_values_mixed,
- pixel_mask_mixed=pixel_mask_mixed,
- labels=labels,
- )
- self.parent.assertEqual(
- result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
- )
- self.parent.assertEqual(
- result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
- )
- self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
-
- def create_and_check_for_pretraining_inference(
- self,
- config,
- pixel_values,
- audio_values,
- pixel_mask,
- audio_mask,
- pixel_values_mixed,
- pixel_mask_mixed,
- labels,
- ):
- model = TvltForPreTraining(config=config)
- model.to(torch_device)
- model.eval()
- result = model(
- pixel_values,
- audio_values,
- pixel_mask,
- audio_mask,
- pixel_values_mixed=pixel_values_mixed,
- pixel_mask_mixed=pixel_mask_mixed,
- labels=labels,
- )
- if result.pixel_logits is not None:
- self.parent.assertEqual(
- result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
- )
- if result.audio_logits is not None:
- self.parent.assertEqual(
- result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
- )
- self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- (config, pixel_values, audio_values, pixel_mask, audio_mask) = config_and_inputs
- inputs_dict = {
- "pixel_values": pixel_values,
- "audio_values": audio_values,
- "pixel_mask": pixel_mask,
- "audio_mask": audio_mask,
- }
- return config, inputs_dict
-
- def prepare_pixel_values(self):
- return floats_tensor(
- [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
- )
-
- def prepare_audio_values(self):
- return floats_tensor(
- [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
- )
-
-
-@require_torch
-class TvltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- all_model_classes = (
- (TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_torch_available() else ()
- )
- pipeline_model_mapping = {"feature-extraction": TvltModel} if is_torch_available() else {}
-
- fx_compatible = False
- test_pruning = False
- test_headmasking = False
- test_torchscript = False
- test_resize_embeddings = False
- main_input_name = "pixel_values"
-
- # TvltForAudioVisualClassification and TvltForPreTraining require special treatment
- def _prepare_for_class(self, inputs_dict, model_class, return_labels=True):
- inputs_dict = copy.deepcopy(inputs_dict)
-
- if return_labels:
- if model_class.__name__ == "TvltForAudioVisualClassification":
- inputs_dict["labels"] = torch.zeros(
- (self.model_tester.batch_size,), dtype=torch.long, device=torch_device
- )
- elif model_class.__name__ == "TvltForPreTraining":
- inputs_dict["labels"] = torch.zeros(
- (self.model_tester.batch_size,), dtype=torch.float, device=torch_device
- )
- inputs_dict["pixel_values_mixed"] = torch.zeros(
- (
- self.model_tester.batch_size,
- self.model_tester.num_frames,
- self.model_tester.num_image_channels,
- self.model_tester.image_size,
- self.model_tester.image_size,
- ),
- dtype=torch.float,
- device=torch_device,
- )
- inputs_dict["pixel_mask_mixed"] = torch.zeros(
- (self.model_tester.batch_size, self.model_tester.expected_pixel_seq_len),
- dtype=torch.float,
- device=torch_device,
- )
-
- return inputs_dict
-
- def setUp(self):
- self.model_tester = TvltModelTester(self)
- self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- @unittest.skip(reason="TVLT does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- def test_model_common_attributes(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- input_embeddings = model.get_input_embeddings()
- self.assertIsInstance(input_embeddings, (tuple))
- for embedding in input_embeddings:
- self.assertIsInstance(embedding, (nn.Module))
- x = model.get_output_embeddings()
- self.assertTrue(x is None or isinstance(x, nn.Linear))
-
- def test_forward_signature(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- signature = inspect.signature(model.forward)
- # signature.parameters is an OrderedDict => so arg_names order is deterministic
- arg_names = [*signature.parameters.keys()]
-
- expected_arg_names = ["pixel_values", "audio_values"]
- self.assertListEqual(arg_names[:2], expected_arg_names)
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_audiovisual_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs)
-
- def test_for_pretraining(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining()
- self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
- self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs)
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "ZinengTang/tvlt-base"
- model = TvltModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- def test_training(self):
- if not self.model_tester.is_training:
- return
-
- for model_class in self.all_model_classes[1:]:
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- model = model_class(config)
- model.to(torch_device)
- model.train()
- inputs = self._prepare_for_class(inputs_dict, model_class)
- for k, v in inputs.items():
- print(k, v.shape)
- loss = model(**inputs).loss
- loss.backward()
-
- def test_training_gradient_checkpointing(self):
- if not self.model_tester.is_training:
- return
-
- for model_class in self.all_model_classes[1:]:
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.use_cache = False
- config.return_dict = True
-
- model = model_class(config)
- model.to(torch_device)
- model.gradient_checkpointing_enable()
- model.train()
- inputs = self._prepare_for_class(inputs_dict, model_class)
- loss = model(**inputs).loss
- loss.backward()
-
- def test_attention_outputs(self):
- if not self.has_attentions:
- pass
-
- else:
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
-
- for model_class in self.all_model_classes[2:]:
- seq_len = self.model_tester.expected_seq_len
-
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = False
- config.return_dict = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
- # check that output_attentions also work using config
- del inputs_dict["output_attentions"]
- config.output_attentions = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
- attentions = outputs.attentions
- self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
- self.assertListEqual(
- list(attentions[0].shape[-3:]),
- [self.model_tester.num_attention_heads, seq_len, seq_len],
- )
- out_len = len(outputs)
-
- # Check attention is always last and order is fine
- inputs_dict["output_attentions"] = True
- inputs_dict["output_hidden_states"] = True
- model = model_class(config)
- model.to(torch_device)
- model.eval()
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
- self.assertEqual(out_len + 1, len(outputs))
-
- self_attentions = outputs.attentions
-
- self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
- self.assertListEqual(
- list(self_attentions[0].shape[-3:]),
- [self.model_tester.num_attention_heads, seq_len, seq_len],
- )
-
- def test_hidden_states_output(self):
- def check_hidden_states_output(inputs_dict, config, model_class):
- model = model_class(config)
- model.to(torch_device)
- model.eval()
-
- with torch.no_grad():
- outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-
- hidden_states = outputs.hidden_states
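- # hidden_states holds the embedding output plus the output of each transformer layer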
- expected_num_layers = self.model_tester.num_hidden_layers + 1
- self.assertEqual(len(hidden_states), expected_num_layers)
-
- seq_length = self.model_tester.expected_seq_len
-
- self.assertListEqual(
- list(hidden_states[0].shape[-2:]),
- [seq_length, self.model_tester.hidden_size],
- )
-
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes[2:]:
- inputs_dict["output_hidden_states"] = True
- check_hidden_states_output(inputs_dict, config, model_class)
-
- # check that output_hidden_states also work using config
- del inputs_dict["output_hidden_states"]
- config.output_hidden_states = True
-
- check_hidden_states_output(inputs_dict, config, model_class)
-
-
-# We will verify our results on a video of eating spaghetti
-# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
-def prepare_video(num_frames=8):
- file = hf_hub_download(
- repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
- )
- video = np.load(file)[:num_frames]
- return list(video)
-
-
-def prepare_audio(num_samples=1):
- ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- # automatic decoding with librispeech
- speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
- return [x["array"] for x in speech_samples]
-
-
-@require_torch
-@require_vision
-class TvltModelIntegrationTest(unittest.TestCase):
- @cached_property
- def default_processors(self):
- # logits were tested with a different mean and std, so we use the same here
- return (
- TvltImageProcessor() if is_vision_available() else None,
- TvltFeatureExtractor(),
- )
-
- def test_inference_for_base_model(self):
- model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-
- image_processor, audio_feature_extractor = self.default_processors
- video = prepare_video()
- audio = prepare_audio()
- video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
- audio_inputs = audio_feature_extractor(audio, return_tensors="pt").to(torch_device)
- inputs = {}
- inputs.update(video_inputs)
- inputs.update(audio_inputs)
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_last_hidden_state_slice = torch.tensor([[-0.0186, -0.0691], [0.0242, -0.0398]], device=torch_device)
- self.assertTrue(
- torch.allclose(outputs.last_hidden_state[:, :2, :2], expected_last_hidden_state_slice, atol=1e-4)
- )
-
- def test_inference_for_pretraining(self):
- model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-
- image_processor, audio_feature_extractor = self.default_processors
- video = prepare_video()
- video_mixed = prepare_video()
- audio = prepare_audio()
- video_inputs = image_processor(video, return_tensors="pt", mask_pixel=True).to(torch_device)
- video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="pt").to(torch_device)
- audio_inputs = audio_feature_extractor(audio, return_tensors="pt", mask_audio=True).to(torch_device)
- labels = torch.tensor([[0.0]], device=torch_device)
- inputs = {}
- inputs.update(video_inputs)
- inputs.update(video_mixed_inputs)
- inputs.update(audio_inputs)
- inputs.update({"labels": labels})
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_pixel_logits_shape = torch.Size([1, 1568, 768])
- expected_audio_logits_shape = torch.Size([1, 96, 256])
- expected_matching_logits_shape = torch.Size([1, 1])
-
- if outputs.pixel_logits is not None:
- self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape)
- if outputs.audio_logits is not None:
- self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape)
- self.assertEqual(outputs.matching_logits.shape, expected_matching_logits_shape)
diff --git a/tests/models/tvlt/test_processor_tvlt.py b/tests/models/tvlt/test_processor_tvlt.py
deleted file mode 100644
index 83f59860fee4da..00000000000000
--- a/tests/models/tvlt/test_processor_tvlt.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-import tempfile
-import unittest
-
-import numpy as np
-import pytest
-
-from transformers import is_speech_available, is_vision_available
-from transformers.testing_utils import require_torch
-
-
-if is_vision_available():
- from transformers import TvltImageProcessor
-
-if is_speech_available():
- from transformers import TvltFeatureExtractor
-
-from transformers import TvltProcessor
-
-
-@require_torch
-class TvltProcessorTest(unittest.TestCase):
- def setUp(self):
- self.checkpoint = "ZinengTang/tvlt-base"
- self.tmpdirname = tempfile.mkdtemp()
-
- def get_image_processor(self, **kwargs):
- return TvltImageProcessor.from_pretrained(self.checkpoint, **kwargs)
-
- def get_feature_extractor(self, **kwargs):
- return TvltFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
-
- def tearDown(self):
- shutil.rmtree(self.tmpdirname)
-
- def test_save_load_pretrained_default(self):
- image_processor = self.get_image_processor()
- feature_extractor = self.get_feature_extractor()
-
- processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
- processor.save_pretrained(self.tmpdirname)
- processor = TvltProcessor.from_pretrained(self.tmpdirname)
-
- self.assertIsInstance(processor.feature_extractor, TvltFeatureExtractor)
- self.assertIsInstance(processor.image_processor, TvltImageProcessor)
-
- def test_feature_extractor(self):
- image_processor = self.get_image_processor()
- feature_extractor = self.get_feature_extractor()
-
- processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
- audio = np.ones([12000])
-
- audio_dict = feature_extractor(audio, return_tensors="np")
- input_processor = processor(audio=audio, return_tensors="np")
-
- for key in audio_dict.keys():
- self.assertAlmostEqual(audio_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
-
- def test_image_processor(self):
- image_processor = self.get_image_processor()
- feature_extractor = self.get_feature_extractor()
-
- processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
- images = np.ones([3, 224, 224])
-
- image_dict = image_processor(images, return_tensors="np")
- input_processor = processor(images=images, return_tensors="np")
-
- for key in image_dict.keys():
- self.assertAlmostEqual(image_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
-
- def test_processor(self):
- image_processor = self.get_image_processor()
- feature_extractor = self.get_feature_extractor()
-
- processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
- audio = np.ones([12000])
- images = np.ones([3, 224, 224])
-
- inputs = processor(audio=audio, images=images)
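- # the combined output should expose the feature extractor's audio keys and the image processor's pixel keys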
-
- self.assertListEqual(list(inputs.keys()), ["audio_values", "audio_mask", "pixel_values", "pixel_mask"])
-
- # test if it raises when no input is passed
- with pytest.raises(ValueError):
- processor()
-
- def test_model_input_names(self):
- image_processor = self.get_image_processor()
- feature_extractor = self.get_feature_extractor()
-
- processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
-
- self.assertListEqual(
- processor.model_input_names,
- image_processor.model_input_names + feature_extractor.model_input_names,
- msg="`processor` and `image_processor`+`feature_extractor` model input names do not match",
- )
diff --git a/tests/models/vit_hybrid/__init__.py b/tests/models/vit_hybrid/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
deleted file mode 100644
index 043dcb4de4504a..00000000000000
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ /dev/null
@@ -1,281 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the PyTorch ViT Hybrid model."""
-
-import unittest
-
-from transformers import ViTHybridConfig
-from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
-from transformers.utils import cached_property, is_torch_available, is_vision_available
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
- import torch
- from torch import nn
-
- from transformers import ViTHybridForImageClassification, ViTHybridImageProcessor, ViTHybridModel
-
-
-if is_vision_available():
- from PIL import Image
-
-
-class ViTHybridModelTester:
- def __init__(
- self,
- parent,
- batch_size=13,
- image_size=64,
- patch_size=2,
- num_channels=3,
- is_training=True,
- use_labels=True,
- hidden_size=32,
- num_hidden_layers=2,
- num_attention_heads=4,
- intermediate_size=37,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- type_sequence_label_size=10,
- initializer_range=0.02,
- backbone_featmap_shape=[1, 16, 4, 4],
- scope=None,
- attn_implementation="eager",
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.image_size = image_size
- self.patch_size = patch_size
- self.num_channels = num_channels
- self.is_training = is_training
- self.use_labels = use_labels
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.intermediate_size = intermediate_size
- self.hidden_act = hidden_act
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.type_sequence_label_size = type_sequence_label_size
- self.initializer_range = initializer_range
- self.scope = scope
- self.backbone_featmap_shape = backbone_featmap_shape
- self.attn_implementation = attn_implementation
-
- # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
- # the number of patches is based on the feature map of the backbone, which by default uses an output stride
- # of 32, which means that the feature map has a spatial resolution of 1/32 of the input image size
- num_patches = (self.image_size // 32) ** 2
- self.seq_length = num_patches + 1
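- # e.g. with the default image_size=64: num_patches = (64 // 32) ** 2 = 4, so seq_length = 5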
-
- def prepare_config_and_inputs(self):
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
-
- labels = None
- if self.use_labels:
- labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-
- config = self.get_config()
-
- return config, pixel_values, labels
-
- def get_config(self):
- backbone_config = {
- "global_padding": "same",
- "layer_type": "bottleneck",
- "depths": [3, 4, 9],
- "out_features": ["stage1", "stage2", "stage3"],
- "embedding_dynamic_padding": True,
- "hidden_sizes": [4, 8, 16, 32],
- "num_groups": 2,
- }
-
- return ViTHybridConfig(
- image_size=self.image_size,
- patch_size=self.patch_size,
- num_channels=self.num_channels,
- hidden_size=self.hidden_size,
- num_hidden_layers=self.num_hidden_layers,
- num_attention_heads=self.num_attention_heads,
- intermediate_size=self.intermediate_size,
- hidden_act=self.hidden_act,
- hidden_dropout_prob=self.hidden_dropout_prob,
- attention_probs_dropout_prob=self.attention_probs_dropout_prob,
- is_decoder=False,
- initializer_range=self.initializer_range,
- backbone_featmap_shape=self.backbone_featmap_shape,
- backbone_config=backbone_config,
- backbone=None,
- attn_implementation=self.attn_implementation,
- )
-
- def create_and_check_model(self, config, pixel_values, labels):
- model = ViTHybridModel(config=config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values)
- self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-
- def create_and_check_for_image_classification(self, config, pixel_values, labels):
- config.num_labels = self.type_sequence_label_size
- model = ViTHybridForImageClassification(config)
- model.to(torch_device)
- model.eval()
- result = model(pixel_values, labels=labels)
- self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
-
- def prepare_config_and_inputs_for_common(self):
- config_and_inputs = self.prepare_config_and_inputs()
- config, pixel_values, labels = config_and_inputs
- inputs_dict = {"pixel_values": pixel_values}
- return config, inputs_dict
-
-
-@require_torch
-class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
- """
- Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds,
- attention_mask and seq_length.
- """
-
- all_model_classes = (ViTHybridModel, ViTHybridForImageClassification) if is_torch_available() else ()
- pipeline_model_mapping = (
- {"image-feature-extraction": ViTHybridModel, "image-classification": ViTHybridForImageClassification}
- if is_torch_available()
- else {}
- )
- test_pruning = False
- test_resize_embeddings = False
- test_head_masking = False
- model_split_percents = [0.5, 0.9]
-
- def setUp(self):
- self.model_tester = ViTHybridModelTester(self)
- self.config_tester = ConfigTester(self, config_class=ViTHybridConfig, has_text_modality=False, hidden_size=37)
-
- def test_config(self):
- self.config_tester.run_common_tests()
-
- @unittest.skip(reason="ViT does not use inputs_embeds")
- def test_inputs_embeds(self):
- pass
-
- def test_model_common_attributes(self):
- config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
- for model_class in self.all_model_classes:
- model = model_class(config)
- self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
- x = model.get_output_embeddings()
- self.assertTrue(x is None or isinstance(x, nn.Linear))
-
- def test_model(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_model(*config_and_inputs)
-
- def test_for_image_classification(self):
- config_and_inputs = self.model_tester.prepare_config_and_inputs()
- self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
-
- def test_initialization(self):
- config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
- configs_no_init = _config_zero_init(config)
- for model_class in self.all_model_classes:
- model = model_class(config=configs_no_init)
- # Skip the check for the backbone
- for name, module in model.named_modules():
- if module.__class__.__name__ == "ViTHybridPatchEmbeddings":
- backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()]
- break
-
- for name, param in model.named_parameters():
- if param.requires_grad:
- if name in backbone_params:
- continue
- self.assertIn(
- ((param.data.mean() * 1e9).round() / 1e9).item(),
- [0.0, 1.0],
- msg=f"Parameter {name} of model {model_class} seems not properly initialized",
- )
-
- @slow
- def test_model_from_pretrained(self):
- model_name = "google/vit-hybrid-base-bit-384"
- model = ViTHybridModel.from_pretrained(model_name)
- self.assertIsNotNone(model)
-
- @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
- def test_batching_equivalence(self):
- super().test_batching_equivalence()
-
-
-# We will verify our results on an image of cute cats
-def prepare_img():
- image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
- return image
-
-
-@require_torch
-@require_vision
-class ViTModelIntegrationTest(unittest.TestCase):
- @cached_property
- def default_image_processor(self):
- return (
- ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
- if is_vision_available()
- else None
- )
-
- @slow
- def test_inference_image_classification_head(self):
- model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384").to(torch_device)
-
- image_processor = self.default_image_processor
- image = prepare_img()
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-
- # forward pass
- with torch.no_grad():
- outputs = model(**inputs)
-
- # verify the logits
- expected_shape = torch.Size((1, 1000))
- self.assertEqual(outputs.logits.shape, expected_shape)
-
- expected_slice = torch.tensor([-1.9090, -0.4993, -0.2389]).to(torch_device)
-
- self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
-
- @slow
- @require_accelerate
- def test_accelerate_inference(self):
- image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
- model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
-
- image = prepare_img()
-
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
- outputs = model(**inputs)
- logits = outputs.logits
- # model predicts one of the 1000 ImageNet classes
- predicted_class_idx = logits.argmax(-1).item()
-
- self.assertEqual(model.config.id2label[predicted_class_idx], "tabby, tabby cat")
diff --git a/tests/models/xlm_prophetnet/__init__.py b/tests/models/xlm_prophetnet/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py
deleted file mode 100644
index 614ccd6ebc2b36..00000000000000
--- a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
-
-
-if is_torch_available():
- import torch
-
- from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer
-
-
-@require_torch
-class XLMProphetNetModelIntegrationTest(unittest.TestCase):
- @slow
- def test_pretrained_checkpoint_hidden_states(self):
- model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
- model.to(torch_device)
-
- # encoder-decoder outputs
- encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
- decoder_prev_ids = torch.tensor(
- [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
- ).to(torch_device)
- output = model(
- input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
- )
- output_predicted_logits = output[0]
- expected_shape = torch.Size((1, 14, 250012))
- self.assertEqual(output_predicted_logits.shape, expected_shape)
- expected_slice = torch.tensor(
- [[[-6.3986, -8.2391, 12.5189], [-6.3289, -8.0864, 12.6211], [-6.2418, -8.0445, 12.7968]]]
- ).to(torch_device)
- self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
-
- # encoder outputs
- encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
- expected_encoder_outputs_slice = torch.tensor(
- [[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]]
- ).to(torch_device)
- expected_shape_encoder = torch.Size((1, 4, 1024))
- self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
- self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
-
- # decoder outputs
- decoder_outputs = model.prophetnet.decoder(
- decoder_prev_ids,
- encoder_hidden_states=encoder_outputs,
- )
- predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
- predicting_streams_logits = model.lm_head(predicting_streams)
- next_first_stream_logits = predicting_streams_logits[:, 0]
- self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
-
- @slow
- def test_ntg_hidden_states(self):
- model = XLMProphetNetForConditionalGeneration.from_pretrained(
- "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
- )
- model.to(torch_device)
-
- encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
- decoder_prev_ids = torch.tensor(
- [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
- ).to(torch_device)
- output = model(
- input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
- )
- output_predicted_logits = output[0]
- expected_shape = torch.Size((1, 14, 250012))
- self.assertEqual(output_predicted_logits.shape, expected_shape)
- # compare the actual values for a slice.
- expected_slice = torch.tensor(
- [[[-9.2253, -9.7173, -6.3529], [-7.6701, -9.0145, -1.9382], [-8.0195, -7.0004, -0.1523]]]
- ).to(torch_device)
-
- self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
-
- @slow
- def test_xprophetnet_ntg_inference(self):
- model = XLMProphetNetForConditionalGeneration.from_pretrained(
- "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
- )
- model.to(torch_device)
- model.config.max_length = 512
-
- tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
-
- EN_SENTENCE = (
- "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after"
- " January 14, 2020, according to the official portal of the organization. From that day, users of this"
- " system will not be able to receive security updates, which could make their computers vulnerable to"
- " cyber attacks."
- )
- RU_SENTENCE = (
- "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7"
- " после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи"
- " этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми"
- " к кибератакам."
- )
- ZH_SENTENCE = "根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。"
-
- input_ids = tokenizer(
- [EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="pt"
- ).input_ids
- input_ids = input_ids.to(torch_device)
-
- summary_ids = model.generate(
- input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
- )
- generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
- EXPECTED_TITLE_EN = "Microsoft to end Windows 7 free support after January 14, 2020"
- EXPECTED_TITLE_RU = "Microsoft намерена прекратить бесплатную поддержку Windows 7 после 14 января 2020 года"
- EXPECTED_TITLE_ZH = "微软打算终止对Windows 7操作系统的免费支持"
- self.assertListEqual(
- [EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH],
- generated_titles,
- )
-
- summary_ids_beam1 = model.generate(
- input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
- )
- generated_titles_beam1_tok = [
- tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
- ]
- EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
- EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
- " "
- )
- EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
- self.assertListEqual(
- [EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK],
- generated_titles_beam1_tok,
- )
diff --git a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
deleted file mode 100644
index cadcc600490cce..00000000000000
--- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers.models.xlm_prophetnet.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, slow
-from transformers.utils import cached_property
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
-
-
-@require_sentencepiece
-class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
- from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased"
- tokenizer_class = XLMProphetNetTokenizer
- test_rust_tokenizer = False
- test_sentencepiece = True
-
- def setUp(self):
- super().setUp()
-
- # We have a SentencePiece fixture for testing
- tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
- tokenizer.save_pretrained(self.tmpdirname)
-
- def test_convert_token_and_id(self):
- """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
- token = "[PAD]"
- token_id = 0
-
- self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
- self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
- def test_get_vocab(self):
- vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
- self.assertEqual(vocab_keys[0], "[PAD]")
- self.assertEqual(vocab_keys[1], "[CLS]")
- self.assertEqual(vocab_keys[-1], "j")
- self.assertEqual(len(vocab_keys), 1_012)
-
- def test_vocab_size(self):
- self.assertEqual(self.get_tokenizer().vocab_size, 1_012)
-
- def test_full_tokenizer(self):
- tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
- tokens = tokenizer.tokenize("This is a test")
- self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
- self.assertListEqual(
- tokenizer.convert_tokens_to_ids(tokens),
- [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]],
- )
-
- tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
- self.assertListEqual(
- tokens,
- [
- SPIECE_UNDERLINE + "I",
- SPIECE_UNDERLINE + "was",
- SPIECE_UNDERLINE + "b",
- "or",
- "n",
- SPIECE_UNDERLINE + "in",
- SPIECE_UNDERLINE + "",
- "9",
- "2",
- "0",
- "0",
- "0",
- ",",
- SPIECE_UNDERLINE + "and",
- SPIECE_UNDERLINE + "this",
- SPIECE_UNDERLINE + "is",
- SPIECE_UNDERLINE + "f",
- "al",
- "s",
- "é",
- ".",
- ],
- )
- ids = tokenizer.convert_tokens_to_ids(tokens)
- self.assertListEqual(
- ids,
- [
- value + tokenizer.fairseq_offset
- for value in [8, 21, 84, 55, 24, 19, 7, -9, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, -9, 4]
- ],
- )
-
- back_tokens = tokenizer.convert_ids_to_tokens(ids)
- self.assertListEqual(
- back_tokens,
- [
- SPIECE_UNDERLINE + "I",
- SPIECE_UNDERLINE + "was",
- SPIECE_UNDERLINE + "b",
- "or",
- "n",
- SPIECE_UNDERLINE + "in",
- SPIECE_UNDERLINE + "",
- "[UNK]",
- "2",
- "0",
- "0",
- "0",
- ",",
- SPIECE_UNDERLINE + "and",
- SPIECE_UNDERLINE + "this",
- SPIECE_UNDERLINE + "is",
- SPIECE_UNDERLINE + "f",
- "al",
- "s",
- "[UNK]",
- ".",
- ],
- )
-
- @cached_property
- def big_tokenizer(self):
- return XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
-
- @slow
- def test_tokenization_base_easy_symbols(self):
- symbols = "Hello World!"
- original_tokenizer_encodings = [35389, 6672, 49, 2]
- self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
-
- @slow
- def test_tokenizer_integration(self):
- expected_encoding = {'input_ids': [[11073, 82783, 18, 26, 82783, 549, 51540, 248, 17209, 1301, 217, 20, 215186, 1325, 147, 17209, 1301, 217, 20, 56370, 53, 122020, 20, 16477, 27, 87355, 4548, 20, 4728, 78392, 17, 159969, 18, 26, 24491, 629, 15, 538, 22704, 5439, 15, 2788, 24491, 9885, 15, 43534, 605, 15, 814, 18403, 33200, 29, 15, 43534, 24458, 12410, 111, 24966, 83669, 9637, 144068, 26, 850, 22346, 27, 147, 24966, 83669, 83490, 26, 39113, 735, 27, 689, 656, 2800, 1339, 4600, 53, 122020, 115785, 34, 816, 1339, 46887, 18, 147, 53905, 1951, 42238, 41170, 17732, 834, 436, 15, 27523, 98733, 217, 147, 5542, 4981, 930, 17347, 16, 2], [20091, 629, 94, 82786, 58, 490, 20, 1528, 84, 53905, 344, 80592, 110128, 18822, 5267, 1306, 62, 152537, 308, 7997, 401, 124427, 549, 35442, 225, 109, 15055, 25748, 147, 7119, 43712, 34, 767, 135366, 18, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [592, 63784, 119466, 17, 147808, 88214, 18, 656, 81, 32, 3296, 10280, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
-
- self.tokenizer_integration_test_util(
- expected_encoding=expected_encoding,
- model_name="microsoft/xprophetnet-large-wiki100-cased",
- revision="1acad1643ddd54a44df6a1b797ada8373685d90e",
- )
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index f631c59b75d40e..e6edcf517a0936 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -61,8 +61,6 @@
# `ignore_value` used during training (even though we don't have a training script for these models yet)
# `norm` used in the conversion script (even though it is not used in the modeling file)
"OneFormerConfig": ["ignore_value", "norm"],
- # used during preprocessing and collation, see `collating_graphormer.py`
- "GraphormerConfig": ["spatial_pos_max"],
# used internally in the configuration class file
"T5Config": ["feed_forward_proj"],
# used internally in the configuration class file
@@ -134,20 +132,16 @@
{
"CLIPSegConfig": True,
"DeformableDetrConfig": True,
- "DetaConfig": True,
"DinatConfig": True,
"DonutSwinConfig": True,
- "EfficientFormerConfig": True,
"FastSpeech2ConformerConfig": True,
"FSMTConfig": True,
- "JukeboxConfig": True,
"LayoutLMv2Config": True,
"MaskFormerSwinConfig": True,
"MT5Config": True,
# For backward compatibility with trust remote code models
"MptConfig": True,
"MptAttentionConfig": True,
- "NatConfig": True,
"OneFormerConfig": True,
"PerceiverConfig": True,
"RagConfig": True,
diff --git a/utils/deprecate_models.py b/utils/deprecate_models.py
index 2307f997202ce0..23308e91a767fc 100644
--- a/utils/deprecate_models.py
+++ b/utils/deprecate_models.py
@@ -45,14 +45,14 @@ def get_last_stable_minor_release():
def build_tip_message(last_stable_release):
return (
"""
-
+
- This model is in maintenance mode only, we don't accept any new PRs changing its code.
- """
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+"""
+ f"""If you run into any issues running this model, please reinstall the last version that supported this model: v{last_stable_release}.
- You can do so by running the following command: `pip install -U transformers=={last_stable_release}`.
+You can do so by running the following command: `pip install -U transformers=={last_stable_release}`.
- """
+"""
)
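
The de-indentation in the hunk above matters because the returned string is pasted into a Markdown doc page, where leading spaces can change how a line renders (four or more spaces turn prose into a code block). A rough sketch of the kind of message the helper builds, with every line kept flush-left; the helper name and the version argument below are illustrative, not taken from the script:

def build_maintenance_tip(last_stable_release: str) -> str:
    # Illustrative only: the real helper is build_tip_message in utils/deprecate_models.py.
    # The message lines start at column 0 so a Markdown renderer treats them as plain prose.
    return (
        "\nThis model is in maintenance mode only, we don't accept any new PRs changing its code.\n"
        "If you run into any issues running this model, please reinstall the last version that "
        f"supported this model: v{last_stable_release}.\n"
        "You can do so by running the following command: "
        f"`pip install -U transformers=={last_stable_release}`.\n"
    )

print(build_maintenance_tip("4.40.2"))  # the version here is just a placeholder
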
@@ -164,7 +164,8 @@ def update_main_init_file(models):
# 1. For each model, find all the instances of model.model_name and replace with model.deprecated.model_name
for model in models:
- init_file = init_file.replace(f"models.{model}", f"models.deprecated.{model}")
+ init_file = init_file.replace(f'models.{model}"', f'models.deprecated.{model}"')
+ init_file = init_file.replace(f"models.{model} import", f"models.deprecated.{model} import")
with open(filename, "w") as f:
f.write(init_file)
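
One subtlety in the update_main_init_file change above: str.replace is a plain substring replacement, so rewriting every occurrence of models.{model} would also touch module paths that merely start with the model's name. Anchoring the pattern on the closing quote and on " import" limits the rewrite to exact references. A minimal, hypothetical sketch of the difference; the foo/foo_bar names and the init_file snippet are made up, not taken from src/transformers/__init__.py:

# Hypothetical example: deprecate "foo" while leaving the unrelated "foo_bar" module alone.
model = "foo"
init_file = (
    '    "models.foo",\n'
    '    "models.foo_bar",\n'
    "from .models.foo import FooConfig\n"
)

# Bare substring replace: also rewrites the "foo_bar" entry.
naive = init_file.replace(f"models.{model}", f"models.deprecated.{model}")
assert '"models.deprecated.foo_bar",' in naive  # unintended rewrite

# Anchored replace, as in the patched script: only exact references change.
anchored = init_file.replace(f'models.{model}"', f'models.deprecated.{model}"')
anchored = anchored.replace(f"models.{model} import", f"models.deprecated.{model} import")
assert '"models.foo_bar",' in anchored           # left untouched
assert '"models.deprecated.foo",' in anchored    # rewritten
assert "from .models.deprecated.foo import FooConfig" in anchored
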
@@ -265,14 +266,14 @@ def add_models_to_deprecated_models_in_config_auto(models):
elif in_deprecated_models and line.strip() == "]":
in_deprecated_models = False
# Add the new models to deprecated models list
- deprecated_models_list.extend([f'"{model},"' for model in models])
+ deprecated_models_list.extend([f' "{model}", ' for model in models])
# Sort so they're in alphabetical order in the file
deprecated_models_list = sorted(deprecated_models_list)
new_file_lines.extend(deprecated_models_list)
# Make sure we still have the closing bracket
new_file_lines.append(line)
elif in_deprecated_models:
- deprecated_models_list.append(line.strip())
+ deprecated_models_list.append(line)
else:
new_file_lines.append(line)
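
The two changes in this hunk keep the regenerated DEPRECATED_MODELS block in configuration_auto.py consistently formatted: new entries are emitted as an indented, quoted name followed by a comma (the old f-string put the comma inside the quotes and added no indentation), and existing entries are appended verbatim instead of stripped, so their indentation survives the rewrite. A small, hypothetical illustration of the two entry formats; the model name and the four-space indentation are just examples:

# Hypothetical illustration of the two entry formats produced by the script.
model = "deta"

old_entry = f'"{model},"'       # -> '"deta,"'    : the comma lands inside the quotes, no indentation
new_entry = f'    "{model}",'   # -> '    "deta",' : quoted name, trailing comma, indented like the
                                #    surrounding entries in the DEPRECATED_MODELS list

print(old_entry)
print(new_entry)
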
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 3ffc2740606ef4..0f9334a85678f8 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -520,8 +520,6 @@ src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
src/transformers/models/deprecated/van/configuration_van.py
src/transformers/models/deprecated/van/convert_van_to_pytorch.py
src/transformers/models/deprecated/van/modeling_van.py
-src/transformers/models/deta/convert_deta_resnet_to_pytorch.py
-src/transformers/models/deta/convert_deta_swin_to_pytorch.py
src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py
src/transformers/models/detr/convert_detr_to_pytorch.py
src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
@@ -541,9 +539,6 @@ src/transformers/models/dpr/modeling_tf_dpr.py
src/transformers/models/dpt/configuration_dpt.py
src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py
src/transformers/models/dpt/convert_dpt_to_pytorch.py
-src/transformers/models/efficientformer/configuration_efficientformer.py
-src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py
-src/transformers/models/efficientformer/modeling_efficientformer.py
src/transformers/models/efficientnet/configuration_efficientnet.py
src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py
src/transformers/models/efficientnet/modeling_efficientnet.py
@@ -611,12 +606,6 @@ src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py
src/transformers/models/gptj/configuration_gptj.py
src/transformers/models/gptj/modeling_flax_gptj.py
src/transformers/models/gptj/modeling_tf_gptj.py
-src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py
-src/transformers/models/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py
-src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
-src/transformers/models/graphormer/collating_graphormer.py
-src/transformers/models/graphormer/configuration_graphormer.py
-src/transformers/models/graphormer/modeling_graphormer.py
src/transformers/models/groupvit/configuration_groupvit.py
src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py
src/transformers/models/hubert/configuration_hubert.py
@@ -642,9 +631,6 @@ src/transformers/models/instructblip/modeling_instructblip.py
src/transformers/models/instructblip/processing_instructblip.py
src/transformers/models/jamba/configuration_jamba.py
src/transformers/models/jamba/modeling_jamba.py
-src/transformers/models/jukebox/configuration_jukebox.py
-src/transformers/models/jukebox/convert_jukebox.py
-src/transformers/models/jukebox/modeling_jukebox.py
src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py
src/transformers/models/led/configuration_led.py
src/transformers/models/led/modeling_led.py
@@ -688,9 +674,6 @@ src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py
src/transformers/models/maskformer/modeling_maskformer_swin.py
src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py
src/transformers/models/mbart/modeling_flax_mbart.py
-src/transformers/models/mega/configuration_mega.py
-src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py
-src/transformers/models/mega/modeling_mega.py
src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py
src/transformers/models/megatron_bert/modeling_megatron_bert.py
src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
@@ -725,7 +708,6 @@ src/transformers/models/mt5/modeling_tf_mt5.py
src/transformers/models/musicgen/convert_musicgen_transformers.py
src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py
src/transformers/models/mvp/modeling_mvp.py
-src/transformers/models/nezha/modeling_nezha.py
src/transformers/models/nllb_moe/configuration_nllb_moe.py
src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
src/transformers/models/nllb_moe/modeling_nllb_moe.py
@@ -766,8 +748,6 @@ src/transformers/models/pvt/configuration_pvt.py
src/transformers/models/pvt/convert_pvt_to_pytorch.py
src/transformers/models/pvt/image_processing_pvt.py
src/transformers/models/pvt/modeling_pvt.py
-src/transformers/models/qdqbert/configuration_qdqbert.py
-src/transformers/models/qdqbert/modeling_qdqbert.py
src/transformers/models/qwen2/configuration_qwen2.py
src/transformers/models/qwen2/modeling_qwen2.py
src/transformers/models/qwen2/tokenization_qwen2.py
@@ -778,8 +758,6 @@ src/transformers/models/rag/configuration_rag.py
src/transformers/models/rag/modeling_rag.py
src/transformers/models/rag/modeling_tf_rag.py
src/transformers/models/rag/retrieval_rag.py
-src/transformers/models/realm/modeling_realm.py
-src/transformers/models/realm/retrieval_realm.py
src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
src/transformers/models/regnet/configuration_regnet.py
@@ -863,8 +841,6 @@ src/transformers/models/timesformer/convert_timesformer_to_pytorch.py
src/transformers/models/timm_backbone/configuration_timm_backbone.py
src/transformers/models/timm_backbone/modeling_timm_backbone.py
src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
-src/transformers/models/tvlt/configuration_tvlt.py
-src/transformers/models/tvlt/modeling_tvlt.py
src/transformers/models/umt5/configuration_umt5.py
src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py
src/transformers/models/umt5/modeling_umt5.py
@@ -890,9 +866,6 @@ src/transformers/models/visual_bert/modeling_visual_bert.py
src/transformers/models/vit/convert_dino_to_pytorch.py
src/transformers/models/vit/convert_vit_timm_to_pytorch.py
src/transformers/models/vit/modeling_flax_vit.py
-src/transformers/models/vit_hybrid/configuration_vit_hybrid.py
-src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py
-src/transformers/models/vit_hybrid/modeling_vit_hybrid.py
src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py
src/transformers/models/vit_mae/modeling_tf_vit_mae.py
src/transformers/models/vit_msn/configuration_vit_msn.py
@@ -922,8 +895,6 @@ src/transformers/models/xglm/modeling_xglm.py
src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
src/transformers/models/xlm/modeling_tf_xlm.py
src/transformers/models/xlm/modeling_xlm.py
-src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
-src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
src/transformers/models/xlm_roberta/modeling_xlm_roberta.py